1 Load Libraries

# Load the packages used throughout the analysis.
library(statsr)     # course companion data/utility functions
library(dplyr)      # data manipulation verbs (select, mutate, ...)
library(MASS)       # classic modeling utilities
library(BAS)        # Bayesian model averaging / BAS regression
library(ggplot2)    # all plotting below
library(devtools)   # development helpers
library(gridExtra)  # grid.arrange() multi-plot layouts
library(grid)       # textGrob()/gpar() used in plot annotations
library(GGally)     # pairwise plot matrices
library(PreProcess) # preprocessing utilities

2 Exploratory Data Analysis

# Load the pre-split Ames housing data (creates the objects ames_train,
# ames_test and ames_validation in the workspace).
load("ames_train.Rdata")
load("ames_test.Rdata")
load("ames_validation.Rdata")
# Number of observations in each split.
nrow(ames_train)
## [1] 1000
nrow(ames_test)
## [1] 817
nrow(ames_validation)
## [1] 763

Let us first take a look into the structure and the dimension of the data set.

# Show the entire structure of the data set.
# Stack the three splits row-wise so all cleaning/recoding below is applied
# consistently to every split. Training rows come FIRST — later code relies
# on this via ames_all[1:nrow(ames_train), ].
ames_all <- rbind(ames_train, ames_test, ames_validation)
str(ames_all)
## tibble [2,580 × 81] (S3: tbl_df/tbl/data.frame)
##  $ PID            : int [1:2580] 909176150 905476230 911128020 535377150 534177230 908128060 902135020 528228540 923426010 908186050 ...
##  $ area           : int [1:2580] 856 1049 1001 1039 1665 1922 936 1246 889 1072 ...
##  $ price          : int [1:2580] 126000 139500 124900 114000 227000 198500 93000 187687 137500 140000 ...
##  $ MS.SubClass    : int [1:2580] 30 120 30 70 60 85 20 20 20 180 ...
##  $ MS.Zoning      : Factor w/ 7 levels "A (agr)","C (all)",..: 6 6 2 6 6 6 7 6 6 7 ...
##  $ Lot.Frontage   : int [1:2580] NA 42 60 80 70 64 60 53 74 35 ...
##  $ Lot.Area       : int [1:2580] 7890 4235 6060 8146 8400 7301 6000 3710 12395 3675 ...
##  $ Street         : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Alley          : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA 2 NA NA NA ...
##  $ Lot.Shape      : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Land.Contour   : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 1 4 4 4 ...
##  $ Utilities      : Factor w/ 3 levels "AllPub","NoSeWa",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lot.Config     : Factor w/ 5 levels "Corner","CulDSac",..: 1 5 5 1 5 1 5 5 1 5 ...
##  $ Land.Slope     : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 2 1 1 1 ...
##  $ Neighborhood   : Factor w/ 28 levels "Blmngtn","Blueste",..: 26 8 12 21 20 8 21 1 15 8 ...
##  $ Condition.1    : Factor w/ 9 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Condition.2    : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Bldg.Type      : Factor w/ 5 levels "1Fam","2fmCon",..: 1 5 1 1 1 1 2 1 1 5 ...
##  $ House.Style    : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 3 3 3 6 6 7 3 3 3 7 ...
##  $ Overall.Qual   : int [1:2580] 6 5 5 4 8 7 4 7 5 6 ...
##  $ Overall.Cond   : int [1:2580] 6 5 9 8 6 5 4 5 6 5 ...
##  $ Year.Built     : int [1:2580] 1939 1984 1930 1900 2001 2003 1953 2007 1984 2005 ...
##  $ Year.Remod.Add : int [1:2580] 1950 1984 2007 2003 2001 2003 1953 2008 1984 2005 ...
##  $ Roof.Style     : Factor w/ 6 levels "Flat","Gable",..: 2 2 4 2 2 2 2 2 2 2 ...
##  $ Roof.Matl      : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Exterior.1st   : Factor w/ 16 levels "AsbShng","AsphShn",..: 15 7 9 9 14 7 9 16 7 14 ...
##  $ Exterior.2nd   : Factor w/ 17 levels "AsbShng","AsphShn",..: 16 7 9 9 15 7 9 17 11 15 ...
##  $ Mas.Vnr.Type   : Factor w/ 6 levels "","BrkCmn","BrkFace",..: 5 3 5 5 5 3 5 3 5 6 ...
##  $ Mas.Vnr.Area   : int [1:2580] 0 149 0 0 0 500 0 20 0 76 ...
##  $ Exter.Qual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 4 3 3 3 3 3 2 3 4 4 ...
##  $ Exter.Cond     : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 3 5 5 5 5 5 5 ...
##  $ Foundation     : Factor w/ 6 levels "BrkTil","CBlock",..: 2 2 1 1 3 4 2 3 2 3 ...
##  $ Bsmt.Qual      : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 4 6 3 4 NA 3 4 6 4 ...
##  $ Bsmt.Cond      : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 6 6 6 NA 6 6 6 6 ...
##  $ Bsmt.Exposure  : Factor w/ 5 levels "","Av","Gd","Mn",..: 5 4 5 5 5 NA 5 3 5 3 ...
##  $ BsmtFin.Type.1 : Factor w/ 7 levels "","ALQ","BLQ",..: 6 4 2 7 4 NA 7 7 2 4 ...
##  $ BsmtFin.SF.1   : int [1:2580] 238 552 737 0 643 0 0 0 647 467 ...
##  $ BsmtFin.Type.2 : Factor w/ 7 levels "","ALQ","BLQ",..: 7 2 7 7 7 NA 7 7 7 7 ...
##  $ BsmtFin.SF.2   : int [1:2580] 0 393 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int [1:2580] 618 104 100 405 167 0 936 1146 217 80 ...
##  $ Total.Bsmt.SF  : int [1:2580] 856 1049 837 405 810 0 936 1146 864 547 ...
##  $ Heating        : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Heating.QC     : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 1 3 1 1 5 1 5 1 ...
##  $ Central.Air    : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 1 2 2 2 ...
##  $ Electrical     : Factor w/ 6 levels "","FuseA","FuseF",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ X1st.Flr.SF    : int [1:2580] 856 1049 1001 717 810 495 936 1246 889 1072 ...
##  $ X2nd.Flr.SF    : int [1:2580] 0 0 0 322 855 1427 0 0 0 0 ...
##  $ Low.Qual.Fin.SF: int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Full.Bath : int [1:2580] 1 1 0 0 1 0 0 0 0 1 ...
##  $ Bsmt.Half.Bath : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int [1:2580] 1 2 1 1 2 3 1 2 1 1 ...
##  $ Half.Bath      : int [1:2580] 0 0 0 0 1 0 0 0 0 0 ...
##  $ Bedroom.AbvGr  : int [1:2580] 2 2 2 2 3 4 2 2 3 2 ...
##  $ Kitchen.AbvGr  : int [1:2580] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 3 3 5 3 3 5 3 5 3 ...
##  $ TotRms.AbvGrd  : int [1:2580] 4 5 5 6 6 7 4 5 6 5 ...
##  $ Functional     : Factor w/ 8 levels "Maj1","Maj2",..: 8 8 8 8 8 8 4 8 8 8 ...
##  $ Fireplaces     : int [1:2580] 1 0 0 0 0 1 0 1 0 0 ...
##  $ Fireplace.Qu   : Factor w/ 5 levels "Ex","Fa","Gd",..: 3 NA NA NA NA 1 NA 3 NA NA ...
##  $ Garage.Type    : Factor w/ 6 levels "2Types","Attchd",..: 6 2 6 6 2 4 6 2 2 3 ...
##  $ Garage.Yr.Blt  : int [1:2580] 1939 1984 1930 1940 2001 2003 1974 2007 1984 2005 ...
##  $ Garage.Finish  : Factor w/ 4 levels "","Fin","RFn",..: 4 2 4 4 2 3 4 2 4 2 ...
##  $ Garage.Cars    : int [1:2580] 2 1 1 1 2 2 2 2 2 2 ...
##  $ Garage.Area    : int [1:2580] 399 266 216 281 528 672 576 428 484 525 ...
##  $ Garage.Qual    : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Garage.Cond    : Factor w/ 6 levels "","Ex","Fa","Gd",..: 6 6 5 6 6 6 6 6 6 6 ...
##  $ Paved.Drive    : Factor w/ 3 levels "N","P","Y": 3 3 1 1 3 3 3 3 3 3 ...
##  $ Wood.Deck.SF   : int [1:2580] 0 0 154 0 0 0 0 100 0 0 ...
##  $ Open.Porch.SF  : int [1:2580] 0 105 0 0 45 0 32 24 0 44 ...
##  $ Enclosed.Porch : int [1:2580] 0 0 42 168 0 177 112 0 0 0 ...
##  $ X3Ssn.Porch    : int [1:2580] 0 0 86 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int [1:2580] 166 0 0 111 0 0 0 0 0 0 ...
##  $ Pool.Area      : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Pool.QC        : Factor w/ 4 levels "Ex","Fa","Gd",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Fence          : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Misc.Feature   : Factor w/ 5 levels "Elev","Gar2",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Misc.Val       : int [1:2580] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Mo.Sold        : int [1:2580] 3 2 11 5 11 7 2 3 4 5 ...
##  $ Yr.Sold        : int [1:2580] 2010 2009 2007 2009 2009 2009 2009 2008 2008 2007 ...
##  $ Sale.Type      : Factor w/ 10 levels "COD","Con","ConLD",..: 10 10 10 10 10 3 10 7 10 10 ...
##  $ Sale.Condition : Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 5 5 5 5 6 5 5 ...
# Show the number of numeric and factor variables.
# vapply() is type-stable and returns the same named logical vector that
# unlist(lapply(...)) produced, without the intermediate list.
nums <- vapply(ames_all, is.numeric, logical(1))
fcts <- vapply(ames_all, is.factor, logical(1))
cat(sum(nums), '\t', sum(fcts))
## 38    43

There are 2,580 rows with 81 columns in the data set (combining ames_train, ames_test and ames_validation), of which 38 are numerical and 43 are categorical.

Response Variable

We first take a look into our response variable price:

# Function to plot Normal QQ
# Function to plot Normal QQ.
# qqplot.data: normal Q-Q plot of a numeric vector with a reference line
# through the first and third sample quartiles (the same line qqline()
# would draw), built with ggplot2.
#
# Argument: vec — numeric vector; NAs are ignored when computing the line
#           (stat_qq() also drops them, with a warning).
# Returns:  a ggplot object.
qqplot.data <- function (vec) # argument: vector of numbers
{
  # Sample quartiles (NA-safe) and the matching theoretical quantiles.
  y <- quantile(vec, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  x <- qnorm(c(0.25, 0.75))
  # Slope and intercept of the line through the two quartile points.
  slope <- diff(y) / diff(x)
  int <- y[1L] - slope * x[1L]
  d <- data.frame(resids = vec)
  ggplot(d, aes(sample = resids)) +
    stat_qq() +
    geom_abline(slope = slope, intercept = int) +
    xlab("Theoretical Quantiles") +
    ylab("Sample Quantiles")
}

# Histogram + density of price (p1) and log(price) (p2), with matching
# normal Q-Q plots (n1, n2) to assess normality of each.
p1 <- ggplot(ames_all, aes(price)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
p2 <- ggplot(ames_all, aes(log(price))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
n1 <- qqplot.data(ames_all$price)
n2 <- qqplot.data(log(ames_all$price))

# 2x2 grid, filled row-wise: price in the left column, log(price) right.
grid.arrange(
  p1, p2, n1, n2,
  nrow = 2,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# Five-number summaries of price and log(price).
summary(ames_all$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12789  129975  159900  178060  209625  755000
summary(log(ames_all$price))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.456  11.775  11.982  12.013  12.253  13.534
# moments provides skewness() and kurtosis(). Note: moments::kurtosis()
# reports raw (Pearson) kurtosis, for which a normal distribution is ~3,
# not excess kurtosis — TODO confirm against the installed package docs.
library(moments)
cat('Skewness and kurtosis of price:\t\t', skewness(ames_all$price), '\t',  kurtosis(ames_all$price), '\n') 
## Skewness and kurtosis of price:       1.759778    8.419952
cat('Skewness and kurtosis of ln(price):\t', skewness(log(ames_all$price)), '\t',  kurtosis(log(ames_all$price)), '\n') 
## Skewness and kurtosis of ln(price):   0.04124497      4.393831

The distribution of price is strongly right-skewed (skewness ≈ 1.76), whereas log(price) is nearly symmetric (skewness ≈ 0.04) and much closer to the Q-Q reference line, so we use log(price) as the response variable in the analysis that follows.

Completeness

Next, we evaluate the completeness of the data set and impute any missing values.

# Show all columns with NA values
NAcolumn <- which(colSums(is.na(ames_all)) > 0)
sort(colSums(sapply(ames_all[NAcolumn], is.na)), decreasing = TRUE)/nrow(ames_all)
##        Pool.QC   Misc.Feature          Alley          Fence   Fireplace.Qu 
##   0.9965116279   0.9624031008   0.9348837209   0.7965116279   0.4810077519 
##   Lot.Frontage  Garage.Yr.Blt    Garage.Qual    Garage.Cond    Garage.Type 
##   0.1790697674   0.0500000000   0.0496124031   0.0496124031   0.0492248062 
##  Garage.Finish      Bsmt.Qual      Bsmt.Cond  Bsmt.Exposure BsmtFin.Type.1 
##   0.0492248062   0.0263565891   0.0263565891   0.0263565891   0.0263565891 
## BsmtFin.Type.2   Mas.Vnr.Area Bsmt.Full.Bath Bsmt.Half.Bath   BsmtFin.SF.1 
##   0.0263565891   0.0054263566   0.0007751938   0.0007751938   0.0003875969 
##   BsmtFin.SF.2    Bsmt.Unf.SF  Total.Bsmt.SF    Garage.Cars    Garage.Area 
##   0.0003875969   0.0003875969   0.0003875969   0.0003875969   0.0003875969

Handling NA

library(forcats)

# For missing Garage.Yr.Blt, substitute Year.Built (a missing garage year
# accompanies a missing garage record; the house's build year is a
# reasonable stand-in that keeps the column numeric).
ames_all$Garage.Yr.Blt[is.na(ames_all$Garage.Yr.Blt)] <- ames_all$Year.Built[is.na(ames_all$Garage.Yr.Blt)]

# For all other missing numerical variables, impute 0.
# NOTE(review): defensible for area-type fields (no basement/garage = 0 sq
# ft), but questionable for Lot.Frontage where 0 is implausible — consider
# median imputation there.
ames_all <- ames_all %>% mutate_if(is.numeric, ~replace(., is.na(.), 0))

# For all categorical variables, turn NA into an explicit 'None' level.
# NOTE(review): fct_explicit_na() is deprecated in forcats >= 1.0 in favor
# of fct_na_value_to_level() — confirm the installed version.
ames_all <- ames_all %>% mutate_if(is.factor, fct_explicit_na, na_level = 'None')

# Drop Utilities (near-constant — see str() output above) and
# Sale.Condition (excluded from modeling).
ames_all <- ames_all %>% dplyr::select(-Utilities) %>% dplyr::select(-Sale.Condition)

# MS.SubClass is a categorical dwelling code stored as an integer;
# convert it to a factor so models treat it categorically.
ames_all$MS.SubClass <- as.factor(ames_all$MS.SubClass)

Detailed EDA

We use random forest with 100 trees to quickly evaluate the important features.

# Run a quick random forest with 100 trees to find important features.
# Fit on the TRAINING rows only (first nrow(ames_train) rows of ames_all)
# so test/validation data do not influence feature selection.
library(randomForest)
set.seed(1)  # reproducible forest
model_rf <- randomForest(log(price) ~ . , data=ames_all[1:nrow(ames_train),], ntree=100, importance=TRUE)
imp_RF <- importance(model_rf)
# Column 1 of the importance matrix is %IncMSE (permutation importance).
imp_DF <- data.frame(Variables = row.names(imp_RF), MSE = imp_RF[,1])
imp_DF <- imp_DF[order(imp_DF$MSE, decreasing = TRUE),]
# Keep the names of the 30 most important variables for later selection.
imp_var <- c(imp_DF[1:30,'Variables'])
# Bar chart of the top 30; label typo fixed ("randomely" -> "randomly").
ggplot(imp_DF[1:30,], aes(x=reorder(Variables, MSE), y=MSE, fill=MSE)) + geom_bar(stat = 'identity') + labs(x = 'Variables', y= '% increase in MSE if variable values are randomly shuffled (permuted)') + coord_flip() + theme(legend.position="none")

We proceed with our analysis using the top 30 important features that model_rf suggests.

# Keep price plus the 30 most important predictors, then split the
# TRAINING rows by column type for the correlation analyses below.
ames_all <- ames_all %>% dplyr::select(price, all_of(imp_var))
num_variables <- dplyr::select_if(ames_all[1:nrow(ames_train),], is.numeric)
cat_variables <- dplyr::select_if(ames_all[1:nrow(ames_train),], is.factor)

We analyze the correlation between these fields with the response variable and within themselves. We look at the numerical variables first:

library(corrplot)
## corrplot 0.84 loaded
# Compute correlation between numerical values.
correlationMatrix <- cor(num_variables)

# Sort on decreasing correlations with price.
cor_sorted <- as.matrix(sort(correlationMatrix[, 'price'], decreasing = TRUE))

# Select only high correlations: variables with |r| > 0.5 against price
# (price itself passes trivially, since cor(price, price) = 1).
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
correlationMatrix <- correlationMatrix[CorHigh, CorHigh]
corrplot.mixed(correlationMatrix, tl.col="black", tl.pos="lt")

The color-coded correlation matrix (above) shows numerical attributes that have absolute correlation strength greater than 0.5 to price. Overall.Qual shows strong correlation (>0.75) with price. Other features like area, Total.Bsmt.SF and Full.Bath show medium to strong correlation (0.5-0.75).

We exclude X1st.Flr.SF, Garage.Area, Garage.Yr.Blt and TotRms.AbvGrd due to multicollinearity (correlation strength larger than 0.75 with another independent variable).

# Keep the high-correlation numeric variables plus all categorical
# candidates, then drop four numerics that are collinear (|r| > 0.75)
# with another predictor, per the corrplot above.
ames_all <- ames_all %>% 
  dplyr::select(all_of(c(CorHigh)), colnames(cat_variables)) %>% 
  dplyr::select(-X1st.Flr.SF, -Garage.Area, -Garage.Yr.Blt, -TotRms.AbvGrd)

We note that the following combinations of variables exhibit absolute correlation strength (0.5-0.75): Overall.Qual & area, Overall.Qual & Total.Bsmt.SF, Overall.Qual & Garage.Cars, Overall.Qual & Year.Built, Overall.Qual & Year.Remod.Add, Overall.Qual & Full.Bath, area & Garage.Cars, area & Full.Bath, Garage.Cars & Year.Built, Year.Built & Year.Remod.Add.

# Numerical variables retained after the correlation screen.
names(dplyr::select_if(ames_all[1:nrow(ames_train),], is.numeric))
## [1] "price"          "Overall.Qual"   "area"           "Total.Bsmt.SF" 
## [5] "Garage.Cars"    "Year.Built"     "Year.Remod.Add" "Full.Bath"     
## [9] "Fireplaces"
# Categorical variables retained.
names(dplyr::select_if(ames_all[1:nrow(ames_train),], is.factor))
##  [1] "Neighborhood"   "MS.SubClass"    "BsmtFin.Type.1" "Bsmt.Exposure" 
##  [5] "Kitchen.Qual"   "Garage.Type"    "Bsmt.Qual"      "MS.Zoning"     
##  [9] "Fireplace.Qu"   "Exterior.1st"   "House.Style"    "Central.Air"
# Scatter plots of log(price) against each retained numeric predictor,
# with a loess smoother (red) and a linear fit (blue) overlaid so
# departures from linearity are visible.
p1 <- ggplot(aes(x=Overall.Qual, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p2 <- ggplot(aes(x=area, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p3 <- ggplot(aes(x=Total.Bsmt.SF, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p4 <- ggplot(aes(x=Garage.Cars, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p5 <- ggplot(aes(x=Year.Built, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p6 <- ggplot(aes(x=Year.Remod.Add, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p7 <- ggplot(aes(x=Full.Bath, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")
p8 <- ggplot(aes(x=Fireplaces, y=log(price)), data=ames_all) + geom_point(alpha=0.75) + geom_smooth(method=loess, fill="red", color="red") + geom_smooth(method=lm, fill="blue", color="blue")

# Create a grid of plots.
grid.arrange(
  p1, p2, p3, p4, p5, p6, p7, p8,
  nrow = 3,
  top = "log(price)",
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

Next, we look into the categorical features. We analyze the distribution of each variable using box plots.

# Box plots of log(price) by each categorical variable, with levels
# ordered by the group median.
# FIX: reorder()'s argument is FUN (case-sensitive). The original
# `fun=median` did not match FUN, fell into `...`, and was silently
# ignored, so levels were ordered by the default (mean) instead.
b1 <- ggplot(aes(x=reorder(Neighborhood, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood")
b2 <- ggplot(aes(x=reorder(MS.SubClass, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("MS.SubClass")
b3 <- ggplot(aes(x=reorder(BsmtFin.Type.1, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("BsmtFin.Type.1")
b4 <- ggplot(aes(x=reorder(Garage.Type, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Garage.Type")
b5 <- ggplot(aes(x=reorder(Bsmt.Qual, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Bsmt.Qual")
b6 <- ggplot(aes(x=reorder(Kitchen.Qual, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Kitchen.Qual")
b7 <- ggplot(aes(x=reorder(Central.Air, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Central.Air")
b8 <- ggplot(aes(x=reorder(Bsmt.Exposure, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Bsmt.Exposure")
b9 <- ggplot(aes(x=reorder(Fireplace.Qu, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Fireplace.Qu")
b10 <- ggplot(aes(x=reorder(MS.Zoning, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("MS.Zoning")
b11 <- ggplot(aes(x=reorder(Exterior.1st, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("Exterior.1st") + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
b12 <- ggplot(aes(x=reorder(House.Style, log(price), FUN=median), y=log(price)), data=ames_all) + geom_boxplot(alpha = 0.75, color='blue') + xlab("House.Style")

# Count bar charts for the same variables, x-axis ordered the same way as
# the box plots so each b/h pair lines up in the grid.
# FIX: `fun=median` -> `FUN=median` (reorder()'s argument is FUN;
# lowercase `fun` was silently ignored and the default mean used).
h1 <- ggplot(ames_all, aes(reorder(Neighborhood, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h2 <- ggplot(ames_all, aes(x=reorder(MS.SubClass, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("MS.SubClass") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h3 <- ggplot(ames_all, aes(x=reorder(BsmtFin.Type.1, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("BsmtFin.Type.1") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h4 <- ggplot(ames_all, aes(x=reorder(Garage.Type, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Garage.Type") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h5 <- ggplot(ames_all, aes(x=reorder(Bsmt.Qual, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Bsmt.Qual") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h6 <- ggplot(ames_all, aes(x=reorder(Kitchen.Qual, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Kitchen.Qual") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h7 <- ggplot(ames_all, aes(x=reorder(Central.Air, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Central.Air") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h8 <- ggplot(ames_all, aes(x=reorder(Bsmt.Exposure, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Bsmt.Exposure") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h9 <- ggplot(ames_all, aes(x=reorder(Fireplace.Qu, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Fireplace.Qu") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h10 <- ggplot(ames_all, aes(x=reorder(MS.Zoning, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("MS.Zoning") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)
h11 <- ggplot(ames_all, aes(x=reorder(Exterior.1st, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("Exterior.1st") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
h12 <- ggplot(ames_all, aes(x=reorder(House.Style, log(price), FUN=median))) + geom_bar(alpha = 0.75, color='blue', fill='blue') + xlab("House.Style") + geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)

# Create a grid of plots: one row per variable, each row pairing the
# box plot (b*) with its count bar chart (h*).
grid.arrange(
  b1, h1, b2, h2, b3, h3, b4, h4, b5, h5, b6, h6, b7, h7, b8, h8, b9, h9, b10, h10, b11, h11, b12, h12,
  nrow = 12,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

The distributions of the important categorical variables such as Neighborhood, MS.SubClass and Fireplace.Qu all exhibit clear differences in log(price) across their groups.

We run pairwise chi-square test for all combinations of categorical variables in ames_all to evaluate the strength of correlation.

# Run chi-square test for all combinations of categorical variables.
# NOTE(review): plyr is attached after dplyr; plyr masks several dplyr
# verbs (summarise, mutate, ...). Prefer plyr::adply() without attaching,
# or attach plyr before dplyr.
library(plyr)
cat_df <- droplevels(cat_variables)  
# All unordered pairs of column indices (2 x nC2 matrix).
combos <- combn(ncol(cat_df), 2)
# For each pair: build the contingency table and run a chi-square test of
# independence. Sparse tables may trigger "approximation may be incorrect"
# warnings — consider simulate.p.value = TRUE if that matters.
# (NB: object name "corelations" is a typo for "correlations", kept as-is.)
cat_corelations <- adply(combos, 2, function(x) {
  column_one <- names(cat_df[, x[1]])
  column_two <- names(cat_df[, x[2]])
  mydata <- data.frame(cat_df[, x[1]], cat_df[, x[2]])
  mytab <- table(mydata)
  test <- chisq.test(mytab)
  
  # One result row per pair: variable names, statistic and p-value.
  out <- data.frame('Column.A' = column_one, 
                    'Column.B' = column_two, 
                    'Chi.Square' = test$statistic,
                    'p.value' = test$p.value)
  
  return(out)
})

# Strongest (smallest p-value) dependencies first.
cat_corelations %>% arrange(p.value)
##    X1       Column.A       Column.B Chi.Square       p.value
## 1  20    MS.SubClass    House.Style 4904.58739  0.000000e+00
## 2  25 BsmtFin.Type.1      Bsmt.Qual 2419.49701  0.000000e+00
## 3  33  Bsmt.Exposure      Bsmt.Qual 1643.62391  0.000000e+00
## 4  22 BsmtFin.Type.1  Bsmt.Exposure 1632.67623 3.898178e-321
## 5   7   Neighborhood      MS.Zoning 1675.62512 1.429146e-266
## 6   1   Neighborhood    MS.SubClass 2113.30616 8.995274e-244
## 7   9   Neighborhood   Exterior.1st 1551.70372 1.141377e-172
## 8   6   Neighborhood      Bsmt.Qual 1074.37787 6.709832e-137
## 9  40   Kitchen.Qual      Bsmt.Qual  606.46571 1.052008e-112
## 10 15    MS.SubClass    Garage.Type  758.11043 4.269185e-109
## 11  4   Neighborhood   Kitchen.Qual  740.79723  1.031665e-96
## 12 17    MS.SubClass      MS.Zoning  557.65857  4.300817e-77
## 13 10   Neighborhood    House.Style  741.90883  4.709411e-77
## 14 19    MS.SubClass   Exterior.1st  695.53148  8.635330e-70
## 15  5   Neighborhood    Garage.Type  692.96301  1.064479e-68
## 16  8   Neighborhood   Fireplace.Qu  631.69723  6.163565e-67
## 17  2   Neighborhood BsmtFin.Type.1  723.45552  1.287607e-65
## 18 16    MS.SubClass      Bsmt.Qual  485.74749  7.539032e-58
## 19 54      Bsmt.Qual   Exterior.1st  414.18138  6.777544e-52
## 20 50    Garage.Type    House.Style  311.76210  1.196497e-45
## 21 23 BsmtFin.Type.1   Kitchen.Qual  277.95727  5.606503e-43
## 22 12    MS.SubClass BsmtFin.Type.1  427.76070  9.481323e-43
## 23 42   Kitchen.Qual   Fireplace.Qu  241.11458  7.030731e-40
## 24 52      Bsmt.Qual      MS.Zoning  266.29166  1.055619e-39
## 25 46    Garage.Type      Bsmt.Qual  275.67325  1.030660e-38
## 26 43   Kitchen.Qual   Exterior.1st  290.31705  5.201018e-38
## 27 53      Bsmt.Qual   Fireplace.Qu  245.68967  1.027544e-35
## 28 59      MS.Zoning    House.Style  244.80707  1.519745e-35
## 29 51    Garage.Type    Central.Air  171.84349  1.827871e-34
## 30 39   Kitchen.Qual    Garage.Type  218.20485  3.006844e-33
## 31 13    MS.SubClass  Bsmt.Exposure  309.77790  6.757078e-32
## 32 18    MS.SubClass   Fireplace.Qu  305.76809  3.233467e-31
## 33 47    Garage.Type      MS.Zoning  215.84774  5.154242e-30
## 34 21    MS.SubClass    Central.Air  168.75901  1.219432e-28
## 35 14    MS.SubClass   Kitchen.Qual  262.82890  1.558278e-28
## 36 37  Bsmt.Exposure    House.Style  203.87167  9.319490e-28
## 37 28 BsmtFin.Type.1   Exterior.1st  296.06951  1.978451e-27
## 38  3   Neighborhood  Bsmt.Exposure  380.94294  1.824458e-26
## 39 48    Garage.Type   Fireplace.Qu  194.19349  6.006762e-26
## 40 49    Garage.Type   Exterior.1st  250.29033  2.977990e-23
## 41 61   Fireplace.Qu   Exterior.1st  225.93296  1.373295e-22
## 42 60      MS.Zoning    Central.Air  107.36123  1.479014e-21
## 43 62   Fireplace.Qu    House.Style  170.23248  1.552867e-21
## 44 29 BsmtFin.Type.1    House.Style  190.65112  7.933891e-21
## 45 55      Bsmt.Qual    House.Style  178.01730  1.056784e-20
## 46 58      MS.Zoning   Exterior.1st  214.03252  1.277635e-20
## 47 56      Bsmt.Qual    Central.Air  104.95682  2.314172e-20
## 48 45   Kitchen.Qual    Central.Air   97.93501  2.706262e-20
## 49 11   Neighborhood    Central.Air  144.21609  2.382353e-18
## 50 64   Exterior.1st    House.Style  217.17778  5.184934e-18
## 51 24 BsmtFin.Type.1    Garage.Type  165.47658  1.423255e-16
## 52 32  Bsmt.Exposure    Garage.Type  138.58803  6.783282e-16
## 53 65   Exterior.1st    Central.Air   91.17590  9.804165e-15
## 54 30 BsmtFin.Type.1    Central.Air   78.25601  3.122311e-14
## 55 44   Kitchen.Qual    House.Style  115.30847  6.590662e-14
## 56 41   Kitchen.Qual      MS.Zoning  105.03690  1.564068e-13
## 57 26 BsmtFin.Type.1      MS.Zoning  132.96944  2.451758e-13
## 58 27 BsmtFin.Type.1   Fireplace.Qu  128.25194  1.445633e-12
## 59 31  Bsmt.Exposure   Kitchen.Qual   98.49227  2.342261e-12
## 60 38  Bsmt.Exposure    Central.Air   52.50011  4.259689e-10
## 61 57      MS.Zoning   Fireplace.Qu   80.86189  8.355346e-08
## 62 63   Fireplace.Qu    Central.Air   32.51091  4.705818e-06
## 63 35  Bsmt.Exposure   Fireplace.Qu   62.81626  4.231599e-05
## 64 34  Bsmt.Exposure      MS.Zoning   58.57374  1.639107e-04
## 65 66    House.Style    Central.Air   21.16085  1.716426e-03
## 66 36  Bsmt.Exposure   Exterior.1st   84.65377  6.253605e-03

The output table shows statistically significant dependencies between every pair of categorical variables tested (all p-values are below 0.01, and most are effectively 0).

3 Feature Engineering

3.1 Transformation

area

We take the log transformation of the variable area as the distribution becomes more symmetric (histogram below), the relationship to log(price) becomes more linear (scatter plot below) and the R-squared for a simple linear regression on log(price) has a higher score after the transformation.

# Compare the distributions of area and log(area): histograms (h1, h2)
# and scatter plots against log(price) (s1, s2) before/after the log
# transform, arranged in a 2x2 grid.
h1 <- ggplot(data=ames_all, aes(area)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
h2 <- ggplot(data=ames_all, aes(log(area))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
s1 <- ggplot(data=ames_all, aes(x=area, y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
s2 <- ggplot(data=ames_all, aes(x=log(area), y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
grid.arrange(
  h1, h2, s1, s2, 
  nrow = 2,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# Compare skewness and kurtosis of area and log(area).
cat('Skewness of area:\t', skewness(ames_all$area), '\n') 
## Skewness of area:     0.9794948
cat('Skewness of ln(area):\t', skewness(log(ames_all$area)), '\n') 
## Skewness of ln(area):     -0.05829055
# Compare R squared of simple linear regression using area and log(area),
# training rows only.
# FIX: the model objects are named fit_raw/fit_log instead of `nor`/`log`;
# `log <- lm(...)` shadowed the base function log() with a data object.
fit_raw <- lm(log(price) ~ area, data=ames_all[1:nrow(ames_train),])
fit_log <- lm(log(price) ~ log(area), data=ames_all[1:nrow(ames_train),])
cat('R squared for area:\t', summary(fit_raw)$r.squared, '\n') 
## R squared for area:   0.5109562
cat('R squared for ln(area):\t', summary(fit_log)$r.squared, '\n')
## R squared for ln(area):   0.5465901
# 2x2 layout for the base-graphics diagnostics below.
par(mfrow = c(2, 2))

# Residual analysis for area (normal Q-Q and residuals-vs-leverage).
plot(fit_raw, which=c(2,5))

# Residual analysis for log(area).
plot(fit_log, which=c(2,5))

# Adopt the transformation: replace area with Log.area.
ames_all <- ames_all %>% mutate(Log.area = log(area)) %>% dplyr::select(-area)

Total.Bsmt.SF

We will not take the log of Total.Bsmt.SF based on the following analysis.

# Compare the distributions of Total.Bsmt.SF and log(Total.Bsmt.SF).
# log(x + 1) is used because Total.Bsmt.SF is 0 for houses with no
# basement and log(0) is undefined.
h1 <- ggplot(data=ames_all, aes(Total.Bsmt.SF)) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
h2 <- ggplot(data=ames_all, aes(log(Total.Bsmt.SF+1))) + geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.75, color='blue', fill='blue') + geom_density()
s1 <- ggplot(data=ames_all, aes(x=Total.Bsmt.SF, y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
s2 <- ggplot(data=ames_all, aes(x=log(Total.Bsmt.SF+1), y=log(price))) + geom_point(alpha=0.75)+geom_smooth(method=lm, fill="blue", color="blue")
grid.arrange(
  h1, h2, s1, s2, 
  nrow = 2,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# Compare skewness of Total.Bsmt.SF and log(Total.Bsmt.SF).
cat('Skewness of BsmtSF:\t', skewness(ames_all$Total.Bsmt.SF), '\n') 
## Skewness of BsmtSF:   0.5101172
# NaN below: houses with Total.Bsmt.SF == 0 give log(0) = -Inf, making the
# skewness undefined — hence the log(x + 1) form used in the model below.
cat('Skewness of ln(BsmtSF):\t', skewness(log(ames_all$Total.Bsmt.SF)), '\n') 
## Skewness of ln(BsmtSF):   NaN
# Compare R squared of simple linear regression using Total.Bsmt.SF and
# log(Total.Bsmt.SF + 1), training rows only.
# FIX: model objects renamed fit_raw/fit_log — `log <- lm(...)` shadowed
# the base function log() with a data object.
fit_raw <- lm(log(price) ~ Total.Bsmt.SF, data=ames_all[1:nrow(ames_train),])
fit_log <- lm(log(price) ~ log(Total.Bsmt.SF+1), data=ames_all[1:nrow(ames_train),])
cat('R squared for Total.Bsmt.SF:\t', summary(fit_raw)$r.squared, '\n') 
## R squared for Total.Bsmt.SF:  0.4476925
cat('R squared for ln(Total.Bsmt.SF):\t', summary(fit_log)$r.squared, '\n')
## R squared for ln(Total.Bsmt.SF):  0.1520897
par(mfrow = c(2, 2))

# Residual analysis for Total.Bsmt.SF (comment fixed: this chunk is about
# Total.Bsmt.SF, not area).
plot(fit_raw, which=c(2,5))

# Residual analysis for log(Total.Bsmt.SF + 1).
plot(fit_log, which=c(2,5))

Neighborhood

We reduce the cardinality of the column Neighborhood to keep the computational cost of model fitting manageable.

# Box plot of log(price) by Neighborhood, levels ordered by group median.
# FIX 1: reorder()'s argument is FUN (case-sensitive); `fun = median` was
# silently ignored and the default (mean) used.
# FIX 2: the constant alpha = 0.75 mapped inside aes() created a spurious
# legend and was overridden by the literal alpha = 0.5 anyway; removed.
b1 <- ggplot(ames_all, aes(x = reorder(Neighborhood, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=Neighborhood), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood")

# Bin the neighborhoods into 4 ordered price tiers (3 = highest-priced),
# chosen from the ordered box plot above, to reduce cardinality.
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('NoRidge', 'NridgHt', 'StoneBr', 'GrnHill', 'Veenker', 'Somerst', 'Timber')] <- 3
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('ClearCr', 'CollgCr', 'Crawfor', 'Greens', 'Blmngtn', 'NWAmes', 'SawyerW')] <- 2
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('Gilbert', 'Mitchel', 'NPkVill', 'NAmes', 'Landmrk', 'SWISU', 'Sawyer')] <- 1
ames_all$Neighborhood.Binned[ames_all$Neighborhood %in% c('Blueste', 'BrkSide', 'Edwards', 'OldTown', 'IDOTRR', 'BrDale', 'MeadowV')] <- 0

ames_all$Neighborhood.Binned <- as.factor(ames_all$Neighborhood.Binned)

# Box plot of the binned variable for comparison.
b2 <- ggplot(ames_all, aes(x = reorder(Neighborhood.Binned, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=Neighborhood.Binned), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("Neighborhood.Binned")

grid.arrange(
  b1, b2,
  nrow = 2,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# The binned factor replaces the original high-cardinality column.
ames_all <- ames_all %>% dplyr::select(-Neighborhood)

MS.SubClass

Reduce the cardinality of the column MS.SubClass.

# Box plot of log(price) by MS.SubClass, levels ordered by group median.
# FIX 1: `fun = median` -> `FUN = median` (reorder()'s argument is FUN,
# case-sensitive; the lowercase form was silently ignored).
# FIX 2: removed the constant alpha = 0.75 wrongly mapped inside aes()
# (spurious legend; overridden by the literal alpha = 0.5 regardless).
b1 <- ggplot(ames_all, aes(x = reorder(MS.SubClass, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=MS.SubClass), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("MS.SubClass")

# Bin the subclass codes into 4 ordered price tiers (3 = highest-priced).
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('60', '120', '80', '75')] <- 3
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('20', '85', '150', '70')] <- 2
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('90', '160', '40', '50')] <- 1
ames_all$MS.SubClass.Binned[ames_all$MS.SubClass %in% c('190', '45', '30', '180')] <- 0
ames_all$MS.SubClass.Binned <- as.factor(ames_all$MS.SubClass.Binned)

# Box plot of the binned variable for comparison.
b2 <- ggplot(ames_all, aes(x = reorder(MS.SubClass.Binned, log(price), FUN = median), y = log(price))) + geom_boxplot(aes(fill=MS.SubClass.Binned), alpha=0.5) + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + xlab("MS.SubClass.Binned")

grid.arrange(
  b1, b2,
  nrow = 2,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# The binned factor replaces the original column.
ames_all <- ames_all %>% dplyr::select(-MS.SubClass)

Unimportant Variables

ames_train %>% dplyr::select(-price, -all_of(imp_var)) %>% colnames()
##  [1] "Lot.Frontage"    "Street"          "Alley"           "Lot.Shape"      
##  [5] "Land.Contour"    "Utilities"       "Lot.Config"      "Land.Slope"     
##  [9] "Condition.1"     "Condition.2"     "Bldg.Type"       "Overall.Cond"   
## [13] "Roof.Style"      "Roof.Matl"       "Exterior.2nd"    "Mas.Vnr.Type"   
## [17] "Mas.Vnr.Area"    "Exter.Qual"      "Exter.Cond"      "Foundation"     
## [21] "Bsmt.Cond"       "BsmtFin.Type.2"  "BsmtFin.SF.2"    "Heating"        
## [25] "Heating.QC"      "Electrical"      "Low.Qual.Fin.SF" "Bsmt.Half.Bath" 
## [29] "Half.Bath"       "Bedroom.AbvGr"   "Kitchen.AbvGr"   "Functional"     
## [33] "Garage.Finish"   "Garage.Qual"     "Garage.Cond"     "Paved.Drive"    
## [37] "Wood.Deck.SF"    "Open.Porch.SF"   "Enclosed.Porch"  "X3Ssn.Porch"    
## [41] "Screen.Porch"    "Pool.Area"       "Pool.QC"         "Fence"          
## [45] "Misc.Feature"    "Misc.Val"        "Mo.Sold"         "Yr.Sold"        
## [49] "Sale.Type"       "Sale.Condition"

3.2 Variable Interaction

The following interaction plots suggest that there could potentially be an interaction between (Log.area and Neighborhood.Binned), (Log.area and MS.SubClass.Binned), (Overall.Qual and Neighborhood.Binned) and (Overall.Qual and MS.SubClass.Binned).

# Interaction plots: one fitted regression line per level of the binned
# factor. Roughly parallel lines suggest no interaction with the numeric
# predictor; diverging slopes suggest an interaction worth testing.
i1 <- ggplot(ames_all, aes(x = Log.area, y = log(price),
                           color = Neighborhood.Binned, shape = Neighborhood.Binned)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

i2 <- ggplot(ames_all, aes(x = Log.area, y = log(price),
                           color = MS.SubClass.Binned, shape = MS.SubClass.Binned)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

i3 <- ggplot(ames_all, aes(x = Overall.Qual, y = log(price),
                           color = Neighborhood.Binned, shape = Neighborhood.Binned)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

i4 <- ggplot(ames_all, aes(x = Overall.Qual, y = log(price),
                           color = MS.SubClass.Binned, shape = MS.SubClass.Binned)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE)

# All four candidate interactions on a 2x2 grid.
grid.arrange(
  i1, i2, i3, i4,
  nrow = 2,
  bottom = textGrob("", gp = gpar(fontface = 3, fontsize = 9), hjust = 1, x = 1)
)

We run simple linear regressions to evaluate whether the interaction terms are indeed significant. Although the adjusted R squared increases slightly once the interaction terms are included (for all combinations), the added interaction terms are less statistically significant than the original first-order terms, so we decide against including these in our linear model.

# Fit the additive model vs. the interaction model for
# Log.area x Neighborhood.Binned on the training rows only (ames_all stacks
# train, test and validation, so the first nrow(ames_train) rows are train).
wo.Log.area.Neighborhood.Binned <- lm(log(price) ~ Log.area + Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
w.Log.area.Neighborhood.Binned <- lm(log(price) ~ Log.area*Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Log.area.Neighborhood.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area + Neighborhood.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94180 -0.10576  0.01067  0.12203  0.85480 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.04516    0.16378   43.02   <2e-16 ***
## Log.area              0.64740    0.02307   28.06   <2e-16 ***
## Neighborhood.Binned1  0.24186    0.01852   13.06   <2e-16 ***
## Neighborhood.Binned2  0.34741    0.02097   16.57   <2e-16 ***
## Neighborhood.Binned3  0.60575    0.02266   26.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.216 on 995 degrees of freedom
## Multiple R-squared:  0.7374, Adjusted R-squared:  0.7363 
## F-statistic: 698.4 on 4 and 995 DF,  p-value: < 2.2e-16
summary(w.Log.area.Neighborhood.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area * Neighborhood.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94160 -0.10314  0.00779  0.11598  0.85466 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    7.04111    0.27104  25.978  < 2e-16 ***
## Log.area                       0.64797    0.03827  16.930  < 2e-16 ***
## Neighborhood.Binned1           1.18786    0.41589   2.856  0.00438 ** 
## Neighborhood.Binned2           0.50743    0.44620   1.137  0.25572    
## Neighborhood.Binned3          -1.28155    0.50919  -2.517  0.01200 *  
## Log.area:Neighborhood.Binned1 -0.13265    0.05847  -2.269  0.02349 *  
## Log.area:Neighborhood.Binned2 -0.02182    0.06158  -0.354  0.72317    
## Log.area:Neighborhood.Binned3  0.25188    0.06908   3.646  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2133 on 992 degrees of freedom
## Multiple R-squared:  0.7447, Adjusted R-squared:  0.7429 
## F-statistic: 413.3 on 7 and 992 DF,  p-value: < 2.2e-16
# FIX: these models test the Overall.Qual x Neighborhood.Binned interaction,
# but the original formulas reused Log.area (copy-paste error) -- their
# printed summaries were identical to the Log.area models above.
wo.Overall.Qual.Neighborhood.Binned <- lm(log(price) ~ Overall.Qual + Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
w.Overall.Qual.Neighborhood.Binned <- lm(log(price) ~ Overall.Qual*Neighborhood.Binned, data=ames_all[1:nrow(ames_train),])
summary(wo.Overall.Qual.Neighborhood.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area + Neighborhood.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94180 -0.10576  0.01067  0.12203  0.85480 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.04516    0.16378   43.02   <2e-16 ***
## Log.area              0.64740    0.02307   28.06   <2e-16 ***
## Neighborhood.Binned1  0.24186    0.01852   13.06   <2e-16 ***
## Neighborhood.Binned2  0.34741    0.02097   16.57   <2e-16 ***
## Neighborhood.Binned3  0.60575    0.02266   26.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.216 on 995 degrees of freedom
## Multiple R-squared:  0.7374, Adjusted R-squared:  0.7363 
## F-statistic: 698.4 on 4 and 995 DF,  p-value: < 2.2e-16
summary(w.Overall.Qual.Neighborhood.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area * Neighborhood.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94160 -0.10314  0.00779  0.11598  0.85466 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    7.04111    0.27104  25.978  < 2e-16 ***
## Log.area                       0.64797    0.03827  16.930  < 2e-16 ***
## Neighborhood.Binned1           1.18786    0.41589   2.856  0.00438 ** 
## Neighborhood.Binned2           0.50743    0.44620   1.137  0.25572    
## Neighborhood.Binned3          -1.28155    0.50919  -2.517  0.01200 *  
## Log.area:Neighborhood.Binned1 -0.13265    0.05847  -2.269  0.02349 *  
## Log.area:Neighborhood.Binned2 -0.02182    0.06158  -0.354  0.72317    
## Log.area:Neighborhood.Binned3  0.25188    0.06908   3.646  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2133 on 992 degrees of freedom
## Multiple R-squared:  0.7447, Adjusted R-squared:  0.7429 
## F-statistic: 413.3 on 7 and 992 DF,  p-value: < 2.2e-16
# Same comparison for Log.area x MS.SubClass.Binned: additive model vs.
# model including the interaction term, fit on the training rows only.
wo.Log.area.MS.SubClass.Binned <- lm(log(price) ~ Log.area + MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
w.Log.area.MS.SubClass.Binned <- lm(log(price) ~ Log.area*MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])

summary(wo.Log.area.MS.SubClass.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area + MS.SubClass.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92814 -0.12669  0.00423  0.14758  0.75446 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.50352    0.18443  29.840  < 2e-16 ***
## Log.area             0.87464    0.02653  32.965  < 2e-16 ***
## MS.SubClass.Binned1 -0.07512    0.03475  -2.162   0.0309 *  
## MS.SubClass.Binned2  0.27313    0.03082   8.862  < 2e-16 ***
## MS.SubClass.Binned3  0.24615    0.03429   7.178 1.38e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.248 on 995 degrees of freedom
## Multiple R-squared:  0.6536, Adjusted R-squared:  0.6522 
## F-statistic: 469.3 on 4 and 995 DF,  p-value: < 2.2e-16
summary(w.Log.area.MS.SubClass.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area * MS.SubClass.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97446 -0.12513  0.01265  0.14916  0.73702 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.62291    0.51556  14.786  < 2e-16 ***
## Log.area                      0.56633    0.07490   7.561 9.08e-14 ***
## MS.SubClass.Binned1          -1.77796    0.71912  -2.472   0.0136 *  
## MS.SubClass.Binned2          -3.12799    0.58387  -5.357 1.05e-07 ***
## MS.SubClass.Binned3          -0.87042    0.63298  -1.375   0.1694    
## Log.area:MS.SubClass.Binned1  0.25095    0.10184   2.464   0.0139 *  
## Log.area:MS.SubClass.Binned2  0.48732    0.08410   5.795 9.18e-09 ***
## Log.area:MS.SubClass.Binned3  0.17355    0.08968   1.935   0.0532 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2427 on 992 degrees of freedom
## Multiple R-squared:  0.6693, Adjusted R-squared:  0.6669 
## F-statistic: 286.8 on 7 and 992 DF,  p-value: < 2.2e-16
# FIX: these models test the Overall.Qual x MS.SubClass.Binned interaction,
# but the original formulas reused Log.area (copy-paste error) -- their
# printed summaries were identical to the Log.area models above.
wo.Overall.Qual.MS.SubClass.Binned <- lm(log(price) ~ Overall.Qual + MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])
w.Overall.Qual.MS.SubClass.Binned <- lm(log(price) ~ Overall.Qual*MS.SubClass.Binned, data=ames_all[1:nrow(ames_train),])

summary(wo.Overall.Qual.MS.SubClass.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area + MS.SubClass.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92814 -0.12669  0.00423  0.14758  0.75446 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.50352    0.18443  29.840  < 2e-16 ***
## Log.area             0.87464    0.02653  32.965  < 2e-16 ***
## MS.SubClass.Binned1 -0.07512    0.03475  -2.162   0.0309 *  
## MS.SubClass.Binned2  0.27313    0.03082   8.862  < 2e-16 ***
## MS.SubClass.Binned3  0.24615    0.03429   7.178 1.38e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.248 on 995 degrees of freedom
## Multiple R-squared:  0.6536, Adjusted R-squared:  0.6522 
## F-statistic: 469.3 on 4 and 995 DF,  p-value: < 2.2e-16
summary(w.Overall.Qual.MS.SubClass.Binned)
## 
## Call:
## lm(formula = log(price) ~ Log.area * MS.SubClass.Binned, data = ames_all[1:nrow(ames_train), 
##     ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97446 -0.12513  0.01265  0.14916  0.73702 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.62291    0.51556  14.786  < 2e-16 ***
## Log.area                      0.56633    0.07490   7.561 9.08e-14 ***
## MS.SubClass.Binned1          -1.77796    0.71912  -2.472   0.0136 *  
## MS.SubClass.Binned2          -3.12799    0.58387  -5.357 1.05e-07 ***
## MS.SubClass.Binned3          -0.87042    0.63298  -1.375   0.1694    
## Log.area:MS.SubClass.Binned1  0.25095    0.10184   2.464   0.0139 *  
## Log.area:MS.SubClass.Binned2  0.48732    0.08410   5.795 9.18e-09 ***
## Log.area:MS.SubClass.Binned3  0.17355    0.08968   1.935   0.0532 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2427 on 992 degrees of freedom
## Multiple R-squared:  0.6693, Adjusted R-squared:  0.6669 
## F-statistic: 286.8 on 7 and 992 DF,  p-value: < 2.2e-16

4 Preprocess

4.1 Normalization

We normalize (scale and center) all numerical attributes.

library(caret)
## Loading required package: lattice
# Center and scale every numeric column except the first (PID).
# NOTE(review): `price` is numeric and not excluded here, so it would be
# centered/scaled too, yet later RMSE computations treat train$price as raw
# dollars -- confirm whether price is meant to be transformed (or excluded).
preProc <- preProcess(ames_all[,-1], method=c("center", "scale"))
ames_all <- predict(preProc, ames_all)

4.2 Outliers

We fit a simple multiple linear regression model on the training dataset and evaluate the Cook’s distance of the observation points.

# Full additive linear model on the training rows; the default plot() method
# for lm produces residual and leverage diagnostics, including Cook's distance.
outlier <- lm(log(price) ~ . , data=ames_all[1:nrow(ames_train),])
par(mfrow = c(2, 2))
plot(outlier)

From the result, we conclude that none of the points included qualifies as a highly influential leverage point. Therefore, we keep all the observations.

4.3 One-hot Encoding

We apply one-hot encoding to the factor variables.

# Drop factor levels with no entries
ames_all <- droplevels(ames_all)

# One-hot encoding using model.matrix(); `-1` removes the intercept column
ames_all <- as.data.frame(model.matrix( ~ . -1, ames_all))

# Standardize irregular column names into syntactically valid R names
colnames(ames_all) <- make.names(colnames(ames_all))

# Drop perfectly collinear columns: alias() lists coefficients that are exact
# linear combinations of others. NOTE(review): when nothing is aliased,
# $Complete is absent and drop_collinear is NULL -- confirm select(-all_of(NULL))
# behaves as a no-op with the installed tidyselect version.
drop_collinear <- rownames(alias(lm(log(price) ~ . , data=ames_all))$Complete)
ames_all <- ames_all %>% dplyr::select(-all_of(drop_collinear))

4.4 Variance Inflation Factor

We drop variables with high VIF (>10) as they make the estimates of coefficients unstable.

# Calculate Variance Inflation Factor of attributes
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:PreProcess':
## 
##     ellipse
## The following object is masked from 'package:dplyr':
## 
##     recode
# GVIF from car::vif on the full additive fit; first column holds the values
vif_df <- data.frame(car::vif(lm(log(price) ~ . , data=ames_all)))
names(vif_df)[1] <- "GVIF"
# Columns with GVIF > 10 are treated as too collinear to keep
drop_high_vif <- rownames(vif_df %>% filter(GVIF>10) %>% dplyr::arrange(desc(GVIF)))
ames_all <- ames_all %>% dplyr::select(-all_of(drop_high_vif))

4.5 PCA

In theory, using a reduced dimension data set makes no difference for the outcome of the prediction. PCA only removes the multicollinearity between the fields. We reduce the dimension of the data set by implementing PCA of the continuous numerical variables and later use this data set to see if it improves the performance.

# Continuous numeric predictors used for PCA (the original comment mentioned
# VIF, but this chunk performs dimensionality reduction)
num_var <- c("Log.area", "Overall.Qual", "Total.Bsmt.SF", "Garage.Cars", "Year.Built", "Year.Remod.Add", "Full.Bath", "Fireplaces")
dim_reduction <- ames_all %>% dplyr::select(all_of(num_var))
# Keep the first 3 principal components (~74% of variance per the summary)
pca <- prcomp(dim_reduction, center=TRUE, scale.=TRUE, rank. = 3)
summary(pca)
## Importance of first k=3 (out of 8) components:
##                           PC1    PC2     PC3
## Standard deviation     2.0028 1.0517 0.88204
## Proportion of Variance 0.5014 0.1383 0.09725
## Cumulative Proportion  0.5014 0.6397 0.73692
library(ggbiplot)
## Loading required package: scales
ggbiplot(pca)
## Warning in sweep(pcobj$x, 2, 1/(d * nobs.factor), FUN = "*"): STATS is longer
## than the extent of 'dim(x)[MARGIN]'
## Warning in sweep(v, 2, d^var.scale, FUN = "*"): STATS is longer than the extent
## of 'dim(x)[MARGIN]'

# Recombine untouched columns with the PC scores; by=0 merges on row names.
# NOTE(review): merge() does not guarantee the original row order even with
# sort=FALSE, and the later train/test split of ames_pca is positional --
# verify rows of ames_pca still align with ames_train/test/validation.
rest <- ames_all %>% dplyr::select(-all_of(num_var))
ames_pca <- merge(rest, pca$x, by=0, sort=FALSE) %>% dplyr::select(-Row.names)

5 Modeling

# Split the stacked data frame back into train, test and validation sets.
# ames_all / ames_pca were built by stacking the three sets in this order,
# so contiguous row ranges recover them.
n_train <- nrow(ames_train)
n_test <- nrow(ames_test)

train <- ames_all[seq_len(n_train), ]
test <- ames_all[n_train + seq_len(n_test), ]
validation <- ames_all[n_train + n_test + seq_len(nrow(ames_validation)), ]

train_pca <- ames_pca[seq_len(n_train), ]
test_pca <- ames_pca[n_train + seq_len(n_test), ]
validation_pca <- ames_pca[n_train + n_test + seq_len(nrow(ames_validation)), ]

5.1 Multiple Linear Regression

We run a multiple linear regression on the pre-processed data set including the interaction terms mentioned above. R is directly solving the normal equations to obtain the least square fit: \[\hat{\beta} = (X^{T}X)^{-1}X^{T}Y\]

# Run multiple linear regression on log(price) using the full model
# (every remaining column after preprocessing serves as a predictor)
linear_model <- lm(log(price) ~ . , data=train)
summary(linear_model)$adj.r.squared
## [1] 0.8758192

5.1.1 Variable Selection: AIC/BIC Backward Elimination

We run stepwise backward variables elimination using AIC and BIC to find the best model.

\[AIC = 2k - 2log(L)\] \[BIC = klog(n) - 2log(L)\]

# Model selection using AIC (stepAIC penalty k = 2)
start_time <- Sys.time()
linear_model.AIC <- stepAIC(linear_model, k=2, trace=0)
end_time <- Sys.time()
end_time - start_time
## Time difference of 2.346269 secs
# Model selection using BIC (penalty k = log(n) gives the BIC criterion)
start_time <- Sys.time()
linear_model.BIC <- stepAIC(linear_model, k=log(nrow(train)), trace=0)
end_time <- Sys.time()
end_time - start_time
## Time difference of 2.47857 secs
summary(linear_model.AIC)$adj.r.squared
## [1] 0.8772922
summary(linear_model.BIC)$adj.r.squared
## [1] 0.8763994

linear_model.AIC has a higher adjusted R squared than linear_model or linear_model.BIC.

5.1.2 Residuals

We use linear_model.AIC (suggestion from the AIC based backward elimination) to analyze the residuals.

# Diagnostics for the AIC-selected model: residuals-vs-fitted (1), normal
# Q-Q (2), scale-location (3) and Cook's distance (4)
par(mfrow = c(2, 2))
plot(linear_model.AIC, which = c(1,2,3,4))

From the result, we notice that the leverage for some observations is one. This is due to the fact that these observations take factor values with only one member. We ignore this case. Next, from the residual plots, we see that the residuals for observations 498, 906 and 979 are about 4-6.25 standard deviations away from the estimated mean. We acknowledge that these points are high leverage points, but since their Cook’s distance (a scalar metric that indicates the strength of the influence of a single observation on the overall fit) falls well below 0.5, we decide against excluding these observations from our analysis. Besides these outliers, the residual plots suggest normality and homoscedasticity.

5.1.3 RMSE

We calculate RMSE in its original unit ($) using AIC and BIC models.

# Train
# Full
# Predictions are on the log scale; exponentiate back to dollars before RMSE
predict.linear_model <- exp(predict(linear_model, train))
## Warning in predict.lm(linear_model, train): prediction from a rank-deficient fit
## may be misleading
resid.linear_model <- train$price - predict.linear_model
rmse.linear_model <- sqrt(mean(resid.linear_model^2))
cat('Linear model (Full) RMSE on train: ', rmse.linear_model, '\n')
## Linear model (Full) RMSE on train:  27751.15
# AIC
predict.linear_model.AIC <- exp(predict(linear_model.AIC, train))
resid.linear_model.AIC <- train$price - predict.linear_model.AIC
rmse.linear_model.AIC <- sqrt(mean(resid.linear_model.AIC^2))
cat('Linear model (AIC) RMSE on train: ', rmse.linear_model.AIC, '\n')
## Linear model (AIC) RMSE on train:  27960.23
# BIC
predict.linear_model.BIC <- exp(predict(linear_model.BIC, train))
resid.linear_model.BIC <- train$price - predict.linear_model.BIC
rmse.linear_model.BIC <- sqrt(mean(resid.linear_model.BIC^2))
cat('Linear model (BIC) RMSE on train: ', rmse.linear_model.BIC, '\n')
## Linear model (BIC) RMSE on train:  28189.23
# Actual vs. predicted scatter per model; blue line is perfect prediction.
# FIX: corrected axis-label typo "Predited" -> "Predicted".
p1 <- ggplot(aes(x=price, y=predict.linear_model), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Full Model Train")
p2 <- ggplot(aes(x=price, y=predict.linear_model.AIC), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("AIC Model Train")
p3 <- ggplot(aes(x=price, y=predict.linear_model.BIC), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("BIC Model Train")

grid.arrange(
  p1, p2, p3, 
  nrow = 1,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

# Test
# Full
# Same evaluation on the held-out test set, back-transformed to dollars
predict.linear_model <- exp(predict(linear_model, test))
## Warning in predict.lm(linear_model, test): prediction from a rank-deficient fit
## may be misleading
resid.linear_model <- test$price - predict.linear_model
rmse.linear_model <- sqrt(mean(resid.linear_model^2))
cat('Linear model (Full) RMSE on test: ', rmse.linear_model, '\n')
## Linear model (Full) RMSE on test:  22412.14
# AIC
predict.linear_model.AIC <- exp(predict(linear_model.AIC, test))
resid.linear_model.AIC <- test$price - predict.linear_model.AIC
rmse.linear_model.AIC <- sqrt(mean(resid.linear_model.AIC^2))
cat('Linear model (AIC) RMSE on test: ', rmse.linear_model.AIC, '\n')
## Linear model (AIC) RMSE on test:  22572.78
# BIC
predict.linear_model.BIC <- exp(predict(linear_model.BIC, test))
resid.linear_model.BIC <- test$price - predict.linear_model.BIC
rmse.linear_model.BIC <- sqrt(mean(resid.linear_model.BIC^2))
cat('Linear model (BIC) RMSE on test: ', rmse.linear_model.BIC, '\n')
## Linear model (BIC) RMSE on test:  22624.94
# FIX: p3 used geom_jitter, which adds random noise to an actual-vs-predicted
# scatter and is inconsistent with every sibling plot; use geom_point.
# FIX: corrected axis-label typo "Predited" -> "Predicted".
p1 <- ggplot(aes(x=price, y=predict.linear_model), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Full Model Test")
p2 <- ggplot(aes(x=price, y=predict.linear_model.AIC), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("AIC Model Test")
p3 <- ggplot(aes(x=price, y=predict.linear_model.BIC), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("BIC Model Test")

grid.arrange(
  p1, p2, p3,
  nrow = 1,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

The results suggest that the RMSE for predictions on the training data set for AIC is lower than BIC by about $229. In the scatter plots showing the predicted value vs. actual value, the AIC model seems to fit the data better, especially at large values. Interestingly, the RMSE on the testing data set is significantly lower than on the training data set for both AIC and BIC models. This result implies:

  1. The training set had many ‘hard’ cases to learn
  2. The testing set had mostly ‘easy’ cases to predict

5.2 Ridge Regression

We use cross validation for hyper parameter tuning (\(\lambda\)), which is the penalty factor for the L2 norm of the coefficients.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1
set.seed(1)

# Design matrices and log-scale responses; the first column of train/test is
# dropped from X. NOTE(review): this assumes column 1 is price -- confirm the
# column order after the one-hot encoding step.
x_train <- as.matrix(train[,-1])
y_train <- log(train$price)

x_test <- as.matrix(test[,-1])
y_test <- log(test$price)

# Fit a path of ridge models (alpha = 0) over a lambda grid, then pick the
# lambda that minimises cross-validated error
lambdas <- 10^seq(3, -3, by = -.1)
ridge_model_glmridge <- glmnet(x_train, y_train, alpha=0, lambda=lambdas)
cv_ridge <- cv.glmnet(x_train, y_train, alpha=0, lambda=lambdas)
plot(cv_ridge)

optimal_lambda_ridge <- cv_ridge$lambda.min
cat('Optimal lambda for ridge regression: ', optimal_lambda_ridge, '\n')
## Optimal lambda for ridge regression:  0.01258925
# Back-transform predictions to dollars; report train RMSE and R-squared
# (newx named explicitly -- it was passed positionally after s= before)
predict.ridge_model_train <- exp(predict(ridge_model_glmridge, s=optimal_lambda_ridge, newx=x_train))
resid.ridge_model_train <- train$price - predict.ridge_model_train
rmse.ridge_model_train <- sqrt(mean(resid.ridge_model_train^2))
sse.train <- sum((predict.ridge_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Ridge regression RMSE and R-square on train:\t', rmse.ridge_model_train, '\t', r_square.train, '\n')
## Ridge regression RMSE and R-square on train:  28024.2     0.8828265
# FIX: corrected axis-label typo "Predited" -> "Predicted"
p1 <- ggplot(aes(x=price, y=predict.ridge_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Ridge Regression on Train Data")

predict.ridge_model_test <- exp(predict(ridge_model_glmridge, s=optimal_lambda_ridge, newx=x_test))
resid.ridge_model_test <- test$price - predict.ridge_model_test
rmse.ridge_model_test <- sqrt(mean(resid.ridge_model_test^2))
sse.test <- sum((predict.ridge_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Ridge regression RMSE and R-square on test:\t', rmse.ridge_model_test, '\t', r_square.test, '\n')
## Ridge regression RMSE and R-square on test:   22518.91    0.9041723
p2 <- ggplot(aes(x=price, y=predict.ridge_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Ridge Regression on Test Data")

grid.arrange(
  p1, p2, 
  nrow = 1,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

5.3 Lasso Regression

We use cross validation for hyper parameter tuning (\(\lambda\)), which is the penalty factor for the L1 norm of the coefficients.

set.seed(1)
# Lambda grid for the L1 penalty
lambdas <- 10^seq(2, -3, by=-0.1)

# Fit the lasso path (alpha = 1) and cross-validate to choose lambda
lasso_model_glmridge <- glmnet(x_train, y_train, alpha=1, lambda=lambdas)
cv_lasso <- cv.glmnet(x_train, y_train, alpha=1, lambda=lambdas)
plot(cv_lasso)

optimal_lambda_lasso <- cv_lasso$lambda.min
cat('Optimal lambda for lasso regression:\t', optimal_lambda_lasso, '\n')
## Optimal lambda for lasso regression:  0.002511886
# Back-transform predictions to dollars; report train RMSE and R-squared
# (newx named explicitly -- it was passed positionally after s= before)
predict.lasso_model_train <- exp(predict(lasso_model_glmridge, s=optimal_lambda_lasso, newx=x_train))
resid.lasso_model_train <- train$price - predict.lasso_model_train
rmse.lasso_model_train <- sqrt(mean(resid.lasso_model_train^2))
sse.train <- sum((predict.lasso_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Lasso regression RMSE and R-square on train:\t', rmse.lasso_model_train, '\t', r_square.train, '\n')
## Lasso regression RMSE and R-square on train:  28517.89    0.8786617
# FIX: corrected axis-label typo "Predited" -> "Predicted"
p1 <- ggplot(aes(x=price, y=predict.lasso_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Lasso Regression on Train Data")

predict.lasso_model_test <- exp(predict(lasso_model_glmridge, s=optimal_lambda_lasso, newx=x_test))
resid.lasso_model_test <- test$price - predict.lasso_model_test
rmse.lasso_model_test <- sqrt(mean(resid.lasso_model_test^2))
sse.test <- sum((predict.lasso_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Lasso regression RMSE and R-square on test:\t', rmse.lasso_model_test, '\t', r_square.test, '\n')
## Lasso regression RMSE and R-square on test:   22566.63    0.9037657
p2 <- ggplot(aes(x=price, y=predict.lasso_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Lasso Regression on Test Data")

grid.arrange(
  p1, p2, 
  nrow = 1,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

5.4 Elastic Regression

We carry out repeated 10-fold cross validation 5 times to find the optimal parameters (lambda and alpha). We use a randomized grid search.

set.seed(1)
# Set training control: 10-fold CV repeated 5 times, randomized grid search
train_cont <- trainControl(method="repeatedcv", number=10, repeats=5, search="random", verboseIter=FALSE)
# Train the model: caret tunes both alpha (mixing) and lambda (penalty)
elastic_model <- train(log(price) ~ ., data=train, method="glmnet", tuneLength=10, trControl=train_cont)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Best tuning parameter
elastic_model$bestTune
##       alpha      lambda
## 2 0.1765568 0.006605359
# NOTE(review): the model was fit with the formula interface but predicted on
# a bare matrix (x_train/x_test); confirm predict.train handles this matrix
# with the expected column layout.
predict.elastic_model_train <- exp(predict(elastic_model, x_train))
resid.elastic_model_train <- train$price - predict.elastic_model_train
rmse.elastic_model_train <- sqrt(mean(resid.elastic_model_train^2))
sse.train <- sum((predict.elastic_model_train - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('\nElastic regression RMSE and R-square on train:\t', rmse.elastic_model_train, '\t', r_square.train, '\n')
## 
## Elastic regression RMSE and R-square on train:    28221.66    0.8811694
# FIX: corrected axis-label typo "Predited" -> "Predicted"
p1 <- ggplot(aes(x=price, y=predict.elastic_model_train), data=train) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Elastic Regression on Train Data")

predict.elastic_model_test <- exp(predict(elastic_model, x_test))
resid.elastic_model_test <- test$price - predict.elastic_model_test
rmse.elastic_model_test <- sqrt(mean(resid.elastic_model_test^2))
sse.test <- sum((predict.elastic_model_test - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Elastic regression RMSE and R-square on test:\t', rmse.elastic_model_test, '\t', r_square.test, '\n')
## Elastic regression RMSE and R-square on test:     22495.49    0.9043716
p2 <- ggplot(aes(x=price, y=predict.elastic_model_test), data=test) + geom_point(alpha=0.5) + geom_abline(intercept=0, slope=1, color='blue') + xlab("Actual Value") + ylab("Predicted Value") + ggtitle("Elastic Regression on Test Data")

grid.arrange(
  p1, p2, 
  nrow = 1,
  bottom = textGrob(
    "",
    gp = gpar(fontface = 3, fontsize = 9),
    hjust = 1,
    x = 1
  )
)

5.5 Bayes Linear Regression (Probabilistic Modeling)

Need to understand how the coefficients are updated by the observations and how the probabilities are calculated.

# Fit the model: Bayesian model averaging over submodels, using the BIC
# approximation to the marginal likelihood and a uniform prior over models
bma <- bas.lm(log(price) ~ . , data=train, prior="BIC", modelprior=uniform())
# Print out the marginal posterior inclusion probability of each variable
bma
## 
## Call:
## bas.lm(formula = log(price) ~ ., data = train, prior = "BIC", 
##     modelprior = uniform())
## 
## 
##  Marginal Posterior Inclusion Probabilities: 
##            Intercept          Overall.Qual         Total.Bsmt.SF  
##              1.00000               1.00000               1.00000  
##          Garage.Cars            Year.Built        Year.Remod.Add  
##              0.96953               0.56340               0.99994  
##            Full.Bath            Fireplaces        BsmtFin.Type.1  
##              0.04294               0.99993               0.02478  
##       Kitchen.QualFa        Kitchen.QualGd        Kitchen.QualPo  
##              0.02887               0.14319               0.02371  
##       Kitchen.QualTA    Garage.TypeBasment    Garage.TypeBuiltIn  
##              0.99591               0.02455               0.02328  
##   Garage.TypeCarPort       Garage.TypeNone           Bsmt.QualEx  
##              0.02252               0.02826               0.99836  
##          Bsmt.QualFa           Bsmt.QualGd           Bsmt.QualPo  
##              0.07259               0.02742               0.02234  
##     MS.ZoningI..all.        Fireplace.QuFa        Fireplace.QuPo  
##              0.18955               0.02212               0.02256  
##  Exterior.1stAsphShn   Exterior.1stBrkComm   Exterior.1stBrkFace  
##              0.49701               0.68095               0.47708  
##   Exterior.1stCBlock   Exterior.1stCemntBd   Exterior.1stImStucc  
##              0.50194               0.02308               0.02269  
##  Exterior.1stPlywood   Exterior.1stPreCast    Exterior.1stStucco  
##              0.02874               0.49879               0.02805  
##  Exterior.1stWdShing     House.Style1.5Unf     House.Style1Story  
##              0.02688               0.02373               0.99992  
##    House.Style2.5Fin     House.Style2.5Unf     House.Style2Story  
##              0.49597               0.07644               0.99809  
##    House.StyleSFoyer       House.StyleSLvl          Central.AirY  
##              0.02709               0.03495               1.00000  
##             Log.area  Neighborhood.Binned1  Neighborhood.Binned2  
##              1.00000               1.00000               1.00000  
## Neighborhood.Binned3   MS.SubClass.Binned1   MS.SubClass.Binned2  
##              1.00000               0.95878               0.99629  
##  MS.SubClass.Binned3  
##              0.10437
# Top 5 most probable models by posterior model probability
summary(bma)
##                      P(B != 0 | Y)    model 1    model 2    model 3    model 4
## Intercept               1.00000000     1.0000     1.0000     1.0000     1.0000
## Overall.Qual            1.00000000     1.0000     1.0000     1.0000     1.0000
## Total.Bsmt.SF           1.00000000     1.0000     1.0000     1.0000     1.0000
## Garage.Cars             0.96953302     1.0000     1.0000     1.0000     1.0000
## Year.Built              0.56340247     1.0000     1.0000     1.0000     1.0000
## Year.Remod.Add          0.99993873     1.0000     1.0000     1.0000     1.0000
## Full.Bath               0.04294058     0.0000     0.0000     0.0000     0.0000
## Fireplaces              0.99992630     1.0000     1.0000     1.0000     1.0000
## BsmtFin.Type.1          0.02477998     0.0000     0.0000     0.0000     0.0000
## Kitchen.QualFa          0.02887293     0.0000     0.0000     0.0000     0.0000
## Kitchen.QualGd          0.14319196     0.0000     0.0000     0.0000     0.0000
## Kitchen.QualPo          0.02370923     0.0000     0.0000     0.0000     0.0000
## Kitchen.QualTA          0.99590679     1.0000     1.0000     1.0000     1.0000
## Garage.TypeBasment      0.02454504     0.0000     0.0000     0.0000     0.0000
## Garage.TypeBuiltIn      0.02327786     0.0000     0.0000     0.0000     0.0000
## Garage.TypeCarPort      0.02252438     0.0000     0.0000     0.0000     0.0000
## Garage.TypeNone         0.02825739     0.0000     0.0000     0.0000     0.0000
## Bsmt.QualEx             0.99835654     1.0000     1.0000     1.0000     1.0000
## Bsmt.QualFa             0.07259172     0.0000     0.0000     0.0000     0.0000
## Bsmt.QualGd             0.02742037     0.0000     0.0000     0.0000     0.0000
## Bsmt.QualPo             0.02233905     0.0000     0.0000     0.0000     0.0000
## MS.ZoningI..all.        0.18955098     0.0000     0.0000     0.0000     0.0000
## Fireplace.QuFa          0.02211737     0.0000     0.0000     0.0000     0.0000
## Fireplace.QuPo          0.02256484     0.0000     0.0000     0.0000     0.0000
## Exterior.1stAsphShn     0.49701018     0.0000     0.0000     0.0000     1.0000
## Exterior.1stBrkComm     0.68095257     1.0000     1.0000     1.0000     1.0000
## Exterior.1stBrkFace     0.47708109     1.0000     1.0000     1.0000     1.0000
## Exterior.1stCBlock      0.50194005     1.0000     0.0000     1.0000     1.0000
## Exterior.1stCemntBd     0.02308044     0.0000     0.0000     0.0000     0.0000
## Exterior.1stImStucc     0.02268727     0.0000     0.0000     0.0000     0.0000
## Exterior.1stPlywood     0.02873601     0.0000     0.0000     0.0000     0.0000
## Exterior.1stPreCast     0.49878631     0.0000     0.0000     1.0000     1.0000
## Exterior.1stStucco      0.02804627     0.0000     0.0000     0.0000     0.0000
## Exterior.1stWdShing     0.02687561     0.0000     0.0000     0.0000     0.0000
## House.Style1.5Unf       0.02372789     0.0000     0.0000     0.0000     0.0000
## House.Style1Story       0.99991793     1.0000     1.0000     1.0000     1.0000
## House.Style2.5Fin       0.49596883     0.0000     0.0000     0.0000     0.0000
## House.Style2.5Unf       0.07644254     0.0000     0.0000     0.0000     0.0000
## House.Style2Story       0.99809045     1.0000     1.0000     1.0000     1.0000
## House.StyleSFoyer       0.02709384     0.0000     0.0000     0.0000     0.0000
## House.StyleSLvl         0.03494704     0.0000     0.0000     0.0000     0.0000
## Central.AirY            1.00000000     1.0000     1.0000     1.0000     1.0000
## Log.area                1.00000000     1.0000     1.0000     1.0000     1.0000
## Neighborhood.Binned1    0.99999814     1.0000     1.0000     1.0000     1.0000
## Neighborhood.Binned2    0.99999853     1.0000     1.0000     1.0000     1.0000
## Neighborhood.Binned3    1.00000000     1.0000     1.0000     1.0000     1.0000
## MS.SubClass.Binned1     0.95878285     1.0000     1.0000     1.0000     1.0000
## MS.SubClass.Binned2     0.99628982     1.0000     1.0000     1.0000     1.0000
## MS.SubClass.Binned3     0.10437164     0.0000     0.0000     0.0000     0.0000
## BF                              NA     1.0000     1.0000     1.0000     1.0000
## PostProbs                       NA     0.0031     0.0031     0.0031     0.0031
## R2                              NA     0.8788     0.8788     0.8788     0.8788
## dim                             NA    21.0000    20.0000    22.0000    23.0000
## logmarg                         NA -1601.4311 -1601.4311 -1601.4311 -1601.4311
##                         model 5
## Intercept                1.0000
## Overall.Qual             1.0000
## Total.Bsmt.SF            1.0000
## Garage.Cars              1.0000
## Year.Built               1.0000
## Year.Remod.Add           1.0000
## Full.Bath                0.0000
## Fireplaces               1.0000
## BsmtFin.Type.1           0.0000
## Kitchen.QualFa           0.0000
## Kitchen.QualGd           0.0000
## Kitchen.QualPo           0.0000
## Kitchen.QualTA           1.0000
## Garage.TypeBasment       0.0000
## Garage.TypeBuiltIn       0.0000
## Garage.TypeCarPort       0.0000
## Garage.TypeNone          0.0000
## Bsmt.QualEx              1.0000
## Bsmt.QualFa              0.0000
## Bsmt.QualGd              0.0000
## Bsmt.QualPo              0.0000
## MS.ZoningI..all.         0.0000
## Fireplace.QuFa           0.0000
## Fireplace.QuPo           0.0000
## Exterior.1stAsphShn      1.0000
## Exterior.1stBrkComm      1.0000
## Exterior.1stBrkFace      1.0000
## Exterior.1stCBlock       1.0000
## Exterior.1stCemntBd      0.0000
## Exterior.1stImStucc      0.0000
## Exterior.1stPlywood      0.0000
## Exterior.1stPreCast      0.0000
## Exterior.1stStucco       0.0000
## Exterior.1stWdShing      0.0000
## House.Style1.5Unf        0.0000
## House.Style1Story        1.0000
## House.Style2.5Fin        0.0000
## House.Style2.5Unf        0.0000
## House.Style2Story        1.0000
## House.StyleSFoyer        0.0000
## House.StyleSLvl          0.0000
## Central.AirY             1.0000
## Log.area                 1.0000
## Neighborhood.Binned1     1.0000
## Neighborhood.Binned2     1.0000
## Neighborhood.Binned3     1.0000
## MS.SubClass.Binned1      1.0000
## MS.SubClass.Binned2      1.0000
## MS.SubClass.Binned3      0.0000
## BF                       1.0000
## PostProbs                0.0031
## R2                       0.8788
## dim                     22.0000
## logmarg              -1601.4311
# RMSE on Training Data
# The model was fit on log(price), so predictions are back-transformed
# with exp() before computing RMSE and R^2 on the price scale.
# Highest Probability Model
pred.train.HPM <- predict(bma, newdata=train, estimator="HPM")
pred.HPM.rmse <- sqrt(mean((exp(pred.train.HPM$fit) - train$price)^2))
sse.train <- sum((exp(pred.train.HPM$fit) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Highest Probability Model (RMSE & R Square) :\t', pred.HPM.rmse, '\t', r_square.train, '\n')
## Highest Probability Model (RMSE & R Square) :     28189.23    0.8814424
# Median Probability Model
pred.train.MPM <- predict(bma, newdata=train, estimator="MPM")
pred.MPM.rmse <- sqrt(mean((exp(pred.train.MPM$fit) - train$price)^2))
sse.train <- sum((exp(pred.train.MPM$fit) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Median Probability Model (RMSE & R Square) :\t', pred.MPM.rmse, '\t', r_square.train, '\n')
## Median Probability Model (RMSE & R Square) :  28357.3     0.8800245
# Best Predictive Model (computation heavy)
# NOTE(review): in BAS, estimator='BPM' is the *Best Predictive Model*,
# not an average over all models (averaging is estimator='BMA').  Also,
# the commented-out sse line below reuses pred.train.MPM instead of
# pred.train.BPM — fix before re-enabling.
#pred.train.BPM <- predict(bma, newdata=train, estimator='BPM')
#pred.BPM.rmse <- sqrt(mean((exp(pred.train.BPM$fit) - train$price)^2))
#sse.train <- sum((exp(pred.train.MPM$fit) - train$price)^2)
#sst.train <- sum((train$price - mean(train$price))^2)
#r_square.train <- 1 - sse.train/sst.train
#cat('Average over all the models:\t', pred.BPM.rmse, '\t', r_square.train)
# RMSE on Testing Data
# Highest Probability Model
pred.test.HPM <- predict(bma, newdata=test, estimator="HPM")
pred.HPM.rmse <- sqrt(mean((exp(pred.test.HPM$fit) - test$price)^2))
sse.test <- sum((exp(pred.test.HPM$fit) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Highest Probability Model (RMSE & R Square) :\t', pred.HPM.rmse, '\t', r_square.test, '\n')
## Highest Probability Model (RMSE & R Square) :     22624.94    0.9032678
# Median Probability Model
pred.test.MPM <- predict(bma, newdata=test, estimator="MPM")
pred.MPM.rmse <- sqrt(mean((exp(pred.test.MPM$fit) - test$price)^2))
sse.test <- sum((exp(pred.test.MPM$fit) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Median Probability Model (RMSE & R Square) :\t', pred.MPM.rmse, '\t', r_square.test, '\n')
## Median Probability Model (RMSE & R Square) :  22672.57    0.9028601
# Best Predictive Model — same BPM/BMA and MPM-reuse caveats as noted
# for the training block.
#pred.test.BPM <- predict(bma, newdata=test, estimator='BPM')
#pred.BPM.rmse <- sqrt(mean((exp(pred.test.BPM$fit) - test$price)^2))
#sse.test <- sum((exp(pred.test.MPM$fit) - test$price)^2)
#sst.test <- sum((test$price - mean(test$price))^2)
#r_square.test <- 1 - sse.test/sst.test
#cat('Average over all the models:\t', pred.BPM.rmse, '\t', r_square.test)

5.6 Kernel Regression (Non Parametric)

Kernel regression is usually applied with just one predictor because of the curse of dimensionality: as the number of variables increases, the computational cost grows quadratically. We find the optimal bandwidth using npregbw().

library(np)
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-10)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
# https://bookdown.org/egarpor/NP-UC3M/kre-i-np.html
# The spinner can be omitted with
options(np.messages = FALSE)

# npregbw() computes by default the least squares CV bandwidth associated to a local *constant* fit
# (regtype="ll" below requests the local *linear* bandwidth instead).
bw_lc <- npregbw(log(price) ~ Log.area + Overall.Qual + Total.Bsmt.SF, data=train)
bw_ll <- npregbw(log(price) ~ Log.area + Overall.Qual + Total.Bsmt.SF, data=train, regtype="ll")
# Locally constant Gaussian kernel regression on the training data 
kre_lc_train <- npreg(bws=bw_lc)
# Locally linear Gaussian kernel regression on the training data 
kre_ll_train <- npreg(bws=bw_ll)
# NOTE(review): kre_lc_train / kre_ll_train are not used below — they
# appear to exist only for the (commented-out) plots; confirm.

# Visualization only good for one predictor. Otherwise, it could be a mess.
#plot(kre_lc, col=2, type = "o")
#points(train$Log.area, log(train$price))
#plot(kre_ll, col=2, type = "o")
#points(train$Log.area, log(train$price))
# Locally linear Gaussian kernel regression on the testing data 
kre_ll_test <- npreg(bw_ll, newdata=test, y.eval=TRUE)
kre_lc_test <- npreg(bw_lc, newdata=test, y.eval=TRUE)

# Back-transform fitted log(price) values and report test RMSE / R^2 on
# the price scale for each fit.
pred.kre_ll <- kre_ll_test$mean
rmse <- sqrt(mean((exp(pred.kre_ll) - test$price)^2))
sse.test <- sum((exp(pred.kre_ll) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Locally Linear Gaussian Kernel Regression (RMSE & R Square) :\t', rmse, '\t', r_square.test, '\n')
## Locally Linear Gaussian Kernel Regression (RMSE & R Square) :     33730.05    0.7850041
pred.kre_lc <- kre_lc_test$mean
rmse <- sqrt(mean((exp(pred.kre_lc) - test$price)^2))
sse.test <- sum((exp(pred.kre_lc) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Locally Constant Gaussian Kernel Regression (RMSE & R Square) :\t', rmse, '\t', r_square.test, '\n')
## Locally Constant Gaussian Kernel Regression (RMSE & R Square) :   33632.61    0.7862444

5.7 Regression Tree

Pruning (stopping the tree from growing further and avoiding splitting a partition if the split does not significantly improve the overall quality of the model) is performed by the caret package, which invokes the rpart method to automatically test different values of cp. It then chooses the optimal cp that maximizes the cross-validation accuracy and fits the final best CART model that best explains the data. tuneLength specifies the number of possible cp values to evaluate. The default value is 3; here we'll use 10.

library(rpart)
library(rpart.plot)

# Tune the complexity parameter cp over 10 candidate values with 10-fold CV.
regression_tree <- train(log(price) ~ . , data=train, method="rpart", trControl=trainControl("cv", number=10), tuneLength=10 )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Plot model error vs different values of complexity parameter
plot(regression_tree)

# Print the best tuning parameter cp that minimizes the model RMSE
regression_tree$bestTune
##           cp
## 1 0.01266821
# Plot the final tree model
rpart.plot(regression_tree$finalModel)

# Decision rules in the model
regression_tree$finalModel
## n= 1000 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1000 176.727500 12.01847  
##    2) Overall.Qual< 0.3318801 629  60.414460 11.80213  
##      4) Overall.Qual< -1.130546 86  15.652640 11.40766  
##        8) Total.Bsmt.SF< -0.6260206 48   7.877014 11.19275  
##         16) Overall.Qual< -2.592973 9   2.355814 10.71026 *
##         17) Overall.Qual>=-2.592973 39   2.942480 11.30410 *
##        9) Total.Bsmt.SF>=-0.6260206 38   2.758337 11.67913 *
##      5) Overall.Qual>=-1.130546 543  29.260410 11.86461  
##       10) Garage.Cars< -0.3339475 261   9.425580 11.73711 *
##       11) Garage.Cars>=-0.3339475 282  11.665930 11.98260  
##         22) Full.Bath< -0.09302467 134   2.999426 11.87981 *
##         23) Full.Bath>=-0.09302467 148   5.968635 12.07567 *
##    3) Overall.Qual>=0.3318801 371  36.962170 12.38526  
##      6) Garage.Cars< 1.018618 245  12.165020 12.24190  
##       12) Total.Bsmt.SF< 1.085312 204   8.135974 12.19347 *
##       13) Total.Bsmt.SF>=1.085312 41   1.169294 12.48290 *
##      7) Garage.Cars>=1.018618 126   9.972419 12.66400  
##       14) Total.Bsmt.SF< 1.456459 70   3.599219 12.51377 *
##       15) Total.Bsmt.SF>=1.456459 56   2.818360 12.85179 *
# Train
best_tree <- regression_tree$finalModel

# Evaluate on training data: predictions are log(price), so back-transform
# with exp() before computing RMSE and R^2 on the price scale.
predictions.train <- predict(best_tree, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Regression Tree on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## Regression Tree on Train (RMSE & R Square) :  40227.7     0.7585577
# Test
predictions.test <- predict(best_tree, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Regression Tree on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## Regression Tree on test (RMSE & R Square) :   41332.31    0.6771687

5.8 Random Forest Regression (Ensembling)

Here, we will use 5-fold cross validation 5 times (repeated CV). mtry is the number of variables randomly sampled as candidates at each split.

library(randomForest)
# Set up a 5-fold cross validation
#control <- trainControl(method="cv", number=5)
# Set up a 5-fold cross validation 5 times (repeated CV)
control <- trainControl(method="repeatedcv", number=5, repeats=5)

# Tune mtry (number of predictors sampled at each split) via repeated CV.
random_forest_model <- train(log(price) ~ . , data=train, method="rf", trControl=control)

# Best Model
random_forest_model$bestTune
##   mtry
## 2   25
# Train
best_forest <- random_forest_model$finalModel

# Evaluate on training data: predictions are log(price), so back-transform
# with exp() before computing RMSE and R^2 on the price scale.
# Fix: the cat() labels previously said "Regression Tree" (copy-paste from
# the previous section); this is the random forest model.
predictions.train <- predict(best_forest, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('Random Forest on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## Random Forest on Train (RMSE & R Square) :  13599.28    0.9724073
# Test
predictions.test <- predict(best_forest, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('Random Forest on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## Random Forest on test (RMSE & R Square) :   27335.35    0.8587965

5.9 Gradient Boosting Regression (Ensembling)

We use caret to run a grid search to find the optimal hyper parameters (eta, max_depth, min_child_weight). Note that this will take some time since we are not running xgboost directly using the library with the efficient data type (xgb.DMatrix).

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Hyper-parameter grid for caret's xgbTree method.  Only eta and max_depth
# are varied; min_child_weight was apparently narrowed to 4 after a wider
# search (see the commented-out line) — confirm before reusing.
xgb_grid = expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree=1,
  #min_child_weight=c(1, 2, 3, 4 ,5),
  min_child_weight=c(4),
  subsample=1
)

Let caret find the best hyperparameter values (using 5 fold cross validation).

start_time <- Sys.time()
control <-trainControl(method="cv", number=5)
# NOTE(review): the grid search itself is commented out, so the elapsed
# time below measures essentially nothing; re-enable the train() call to
# reproduce the ~20 s/parameter-set tuning run.
#xgb_caret <- train(log(price) ~ . , data=train, method='xgbTree', trControl=control, tuneGrid=xgb_grid) 
#xgb_caret$bestTune
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.001997948 secs
# Time difference of 20.82481 secs for one set of parameters

The optimal parameters are max_depth=3, eta=0.05, and min_child_weight=4.

# xgboost parameters selected by the caret grid search described above.
default_param<-list(
        objective = "reg:squarederror",
        booster = "gbtree",
        eta=0.05, #default = 0.3
        gamma=0,
        max_depth=3, #default=6
        min_child_weight=4, #default=1
        subsample=1,
        colsample_bytree=1
)
# put our testing & training data into two separate Dmatrixs objects
# (x_train/y_train and x_test/y_test are assumed to be built earlier in
# the document — predictors as a matrix, labels as log(price)).
dtrain <- xgb.DMatrix(data=x_train, label=y_train)
dtest <- xgb.DMatrix(data=x_test, label=y_test)

We do a cross validation on the number of rounds using the method xgb.cv from xgboost.

# 5-fold CV over the number of boosting rounds, with early stopping after
# 10 rounds without test-RMSE improvement.
xgbcv <- xgb.cv(params=default_param, data=dtrain, nrounds=500, nfold=5, showsd=T, stratified=T, print_every_n=40, early_stopping_rounds=10, maximize=F)
## [1]  train-rmse:10.951333+0.006891   test-rmse:10.951263+0.029457 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
## 
## [41] train-rmse:1.433526+0.001023    test-rmse:1.435024+0.016870 
## [81] train-rmse:0.229844+0.001913    test-rmse:0.248510+0.016353 
## [121]    train-rmse:0.123191+0.002988    test-rmse:0.160469+0.022712 
## [161]    train-rmse:0.114077+0.002535    test-rmse:0.157740+0.023954 
## [201]    train-rmse:0.108690+0.002335    test-rmse:0.157171+0.024501 
## [241]    train-rmse:0.104254+0.002650    test-rmse:0.156626+0.024581 
## Stopping. Best iteration:
## [244]    train-rmse:0.103846+0.002685    test-rmse:0.156606+0.024545
#train the model using the best iteration found by cross validation
# NOTE(review): the CV run above reports best iteration 244, but
# nrounds=254 is used here — confirm this off-by-ten is intentional.
xgb_model <- xgb.train(data=dtrain, params=default_param, nrounds=254)

# Train
# Predictions are log(price); back-transform with exp() before computing
# RMSE and R^2 on the price scale.
predictions.train <- predict(xgb_model, dtrain)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('XGB on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## XGB on Train (RMSE & R Square) :  20013.28    0.9402415
# Test
predictions.test <- predict(xgb_model, dtest)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('XGB on test (RMSE & R Square) :\t\t', rmse.test, '\t', r_square.test, '\n')
## XGB on test (RMSE & R Square) :       26328.89    0.869003
#view variable importance plot
library(Ckmeans.1d.dp) #required for ggplot clustering
mat <- xgb.importance(feature_names=colnames(x_train), model=xgb_model)
xgb.ggplot.importance(importance_matrix=mat[1:20], rel_to_first=TRUE)

5.10 Neural Network Regression

We use caret’s method nnet to run a 5-fold cross validation to find the best number of nodes and weight decay factor.

library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
#http://sebastianderi.com/aexam/hld_MODEL_neural.html

set.seed(1)

# Step 1: SELECT TUNING PARAMETERS
# Set range of tuning parameters (layer size [number of nodes] and weight decay)
tune_grid_neural <- expand.grid(size=c(1:5), decay=c(0, 0.05, 0.1, 0.5, 1))

# Set other constrains to be imposed on network (to keep computation manageable)
# NOTE(review): max_weights_neural is computed but never passed to train()
# (e.g. via MaxNWts), so it currently has no effect.  Also, the nnet weight
# count is size*(p+1)+size+1 where p is the number of *predictors* —
# nrow(train) here looks like it should be the predictor count; confirm.
# (The variable name "neaural" is a typo kept as-is to avoid code changes.)
max_size_neaural <- max(tune_grid_neural$size)
max_weights_neural <- max_size_neaural*(nrow(train) + 1) + max_size_neaural + 1

# Step 2: SELECT TUNING METHOD
# set up train control object, which specifies training/testing technique
control_neural <- trainControl(method="cv", number=5)

# Step 3: TRAIN MODEL
# linout=TRUE makes nnet perform regression (linear output unit).
nn <- train(log(price) ~ . , data=train, method="nnet", linout=TRUE, tuneGrid=tune_grid_neural, trControl=control_neural, trace=FALSE, maxit=100)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
nn_model <- nn$finalModel
# Train
# Predictions are log(price); back-transform with exp() before computing
# RMSE and R^2 on the price scale.
predictions.train <- predict(nn_model, train)
rmse.train <- sqrt(mean((exp(predictions.train) - train$price)^2))
sse.train <- sum((exp(predictions.train) - train$price)^2)
sst.train <- sum((train$price - mean(train$price))^2)
r_square.train <- 1 - sse.train/sst.train
cat('NN on Train (RMSE & R Square) :\t', rmse.train, '\t', r_square.train, '\n')
## NN on Train (RMSE & R Square) :   27312.29    0.8887041
# Test
predictions.test <- predict(nn_model, test)
rmse.test <- sqrt(mean((exp(predictions.test) - test$price)^2))
sse.test <- sum((exp(predictions.test) - test$price)^2)
sst.test <- sum((test$price - mean(test$price))^2)
r_square.test <- 1 - sse.test/sst.test
cat('NN on test (RMSE & R Square) :\t', rmse.test, '\t', r_square.test, '\n')
## NN on test (RMSE & R Square) :    24324.22    0.8881917

6 Validation

eval_metrics <- function(model, validation) {
    # Evaluate a linear model that was fit on log(price): back-transform
    # predictions with exp(), then print the RMSE on the price scale and
    # the model's (training-time) adjusted R-squared, tab-separated.
    predicted_price <- exp(predict(model, validation))
    rmse <- sqrt(mean((validation$price - predicted_price)^2))
    adj_r2 <- summary(model)$adj.r.squared
    cat(rmse, '\t', adj_r2, '\n')
}

eval_metrics_gml = function(model, validation, optimal_lambda){
    # Evaluate a penalized (glmnet-style) model fit on log(price) at the
    # chosen lambda: predict, back-transform with exp(), and print RMSE
    # and R^2 on the price scale, tab-separated.
    # Assumes price is the first column of `validation`; remaining
    # columns are the predictors (passed positionally as newx).
    # Fix: removed unused local y_validate and the redundant second
    # computation of the squared errors.
    x_validate = as.matrix(validation[,-1])
    predictions <- exp(predict(model, s=optimal_lambda, x_validate))
    residuals <- validation$price - predictions
    rmse <- sqrt(mean(residuals^2))
    sse <- sum(residuals^2)
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1 - sse/sst
    cat(rmse, '\t', r_square, '\n')
}

eval_metrics_elastic_gml = function(model, validation){
    # Evaluate an elastic-net model (lambda already fixed inside the model
    # object, so no `s` argument): predict log(price), back-transform with
    # exp(), and print RMSE and R^2 on the price scale, tab-separated.
    # Assumes price is the first column of `validation`.
    # Fix: removed unused local y_validate and the redundant second
    # computation of the squared errors.
    x_validate = as.matrix(validation[,-1])
    predictions <- exp(predict(model, x_validate))
    residuals <- validation$price - predictions
    rmse <- sqrt(mean(residuals^2))
    sse <- sum(residuals^2)
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1 - sse/sst
    cat(rmse, '\t', r_square, '\n')
}

eval_metrics_bma = function(model, validation){
    # Evaluate a BAS model via its Highest Probability Model predictions:
    # back-transform $fit from the log scale with exp(), then print RMSE
    # and R^2 on the price scale, tab-separated.
    # Fix: previously predicted with the global `bma` object, silently
    # ignoring the `model` argument.
    predictions <- predict(model, newdata=validation, estimator="HPM")
    rmse <- sqrt(mean((exp(predictions$fit) - validation$price)^2))
    sse <- sum((exp(predictions$fit) - validation$price)^2)
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1 - sse/sst
    cat(rmse, '\t', r_square, '\n')
}

eval_metrics_kernel = function(bw, validation){
    # Evaluate a kernel regression (np bandwidth object) on the validation
    # set and print RMSE and R^2 on the price scale, tab-separated.
    # Removing outliers... Non-parametric models are not good for extrapolation
    validation <- validation %>% filter(price!=284000 & Overall.Qual!=1.4286999 & Total.Bsmt.SF!=5.1786677591 & Log.area!=0.444833205)
    
    kre <- npreg(bw, newdata=validation, y.eval=TRUE)
    predictions <- kre$mean
    rmse <- sqrt(mean((exp(predictions) - validation$price)^2))
    sse <- sum((exp(predictions) - validation$price)^2)
    # Fix: the total sum of squares was previously computed from the
    # global `test` data frame; it must use the (filtered) validation set.
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1-sse/sst
    cat(rmse, '\t', r_square, '\n')
}

eval_metrics_others <- function(model, validation) {
    # Generic evaluator for models whose predict() returns log(price):
    # back-transform with exp(), then print RMSE and R^2 on the price
    # scale, tab-separated.
    price_hat <- exp(predict(model, validation))
    err <- price_hat - validation$price
    rmse <- sqrt(mean(err^2))
    sse <- sum(err^2)
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1 - sse/sst
    cat(rmse, '\t', r_square, '\n')
}

eval_metrics_xgb <- function(model, validation) {
    # Evaluate an xgboost model: wrap the validation predictors (all
    # columns except the first, which holds price) in an xgb.DMatrix,
    # predict log(price), back-transform with exp(), and print RMSE and
    # R^2 on the price scale, tab-separated.
    x_validate = as.matrix(validation[,-1])
    y_validate = log(validation$price)
    dval <- xgb.DMatrix(data=x_validate, label=y_validate)
    price_hat <- exp(predict(model, dval))
    err <- price_hat - validation$price
    rmse <- sqrt(mean(err^2))
    sse <- sum(err^2)
    sst <- sum((validation$price - mean(validation$price))^2)
    r_square <- 1 - sse/sst
    cat(rmse, '\t', r_square, '\n')
}
# Compare all fitted models on the held-out validation set.
# NOTE(review): eval_metrics() prints the model's *training* adjusted
# R-squared, while the other helpers print validation-set R^2, so the
# second column is not directly comparable across all rows.
cat('RMSE and Adjusted R Square on Validation Data\n\n')
## RMSE and Adjusted R Square on Validation Data
cat('linear full:\t\t')
## linear full:     
eval_metrics(linear_model, validation)
## 23069.92      0.8758192
cat('linear AIC:\t\t')
## linear AIC:      
eval_metrics(linear_model.AIC, validation)
## 23208.37      0.8772922
cat('linear BIC:\t\t')
## linear BIC:      
eval_metrics(linear_model.BIC, validation)
## 23616.35      0.8763994
cat('ridge:\t\t\t')
## ridge:           
# NOTE(review): the lambda arguments below look swapped — the ridge model
# is evaluated with optimal_lambda_lasso and the lasso model with
# optimal_lambda_ridge.  Confirm against where these were tuned.
eval_metrics_gml(ridge_model_glmridge, validation, optimal_lambda_lasso)
## 23073.38      0.8824473
cat('lasso:\t\t\t')
## lasso:           
eval_metrics_gml(lasso_model_glmridge, validation, optimal_lambda_ridge)
## 23669.87      0.8762908
cat('elastic net:\t\t')
## elastic net:     
eval_metrics_elastic_gml(elastic_model, validation)
## 23084.77      0.8823313
# NOTE(review): "baseysian" below is a typo for "bayesian" in a runtime
# string; left unchanged here to keep code and shown output consistent.
cat('baseysian linear:\t')
## baseysian linear:    
eval_metrics_bma(bma, validation)
## 23616.35      0.8768497
cat('kernel regression:\t')
## kernel regression:   
eval_metrics_kernel(bw_ll, validation)
## 26203.26      0.8802653
cat('regression tree:\t')
## regression tree: 
eval_metrics_others(best_tree, validation)
## 37963.24      0.6817734
cat('random forest:\t\t')
## random forest:       
eval_metrics_others(best_forest, validation)
## 24319.89      0.869403
cat('xgboost model:\t\t')
## xgboost model:       
eval_metrics_xgb(xgb_model, validation)
## 22949.22      0.883709
cat('nn model:\t\t')
## nn model:        
eval_metrics_others(nn_model, validation)
## 22975.79      0.8834396

The resulting RMSE is smaller compared to the RMSEs from both the training and the testing data.

# Calculate proportion of observations that fall within prediction intervals
# The 95% prediction interval is computed on the log scale and
# back-transformed with exp(); the empirical coverage (~96.3%) is then the
# fraction of validation prices strictly inside (lwr, upr).
interval.linear_model.AIC <- exp(predict(linear_model.AIC, validation, interval="prediction"))
coverage.prob.linear_model.AIC <- mean(validation$price > interval.linear_model.AIC[,"lwr"] & validation$price < interval.linear_model.AIC[,"upr"])
coverage.prob.linear_model.AIC
## [1] 0.9633028

Finally, about 96% of the observations contain the true price of the house within their 95% predictive confidence (or credible) intervals. From this result, we can state that our final model properly reflects uncertainty.


7 Conclusion


We studied the housing price data set in detail in the EDA section. We found that there is strong multicollinearity between many variables (both numerical and categorical). Strong correlations and statistical tests for significance indicate that attributes like Overall.Qual, area and Total.Bsmt.SF make the best predictors for the response variable price. These attributes all have a positive association with the response variable. Our final linear regression achieved an RMSE score of 21872.24 and a coverage percentage of about 96% (i.e. the percentage of observations containing the true price within their 95% predictive confidence (or credible) intervals).

Referring back to the original purpose of this study, which is to identify the houses that are underpriced in the market, those houses could be identified as any house under the blue line in the AIC Model Validation plot (see above).